Revert D25399466: add channels last support for AvgPool2d on CPU

Test Plan: revert-hammer

Differential Revision: D25399466 (8ac0917cc7)

Original commit changeset: 9477b0c281c0

fbshipit-source-id: e0245f0e390f5eca228445fd82d6e5142a827abc
Author: Vitaly Fedyunin
Date: 2021-05-14 12:42:43 -07:00
Committed by: Facebook GitHub Bot
Parent: 0caec739a3
Commit: 49a8942a77
5 changed files with 241 additions and 464 deletions
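For context: the change being reverted routed CPU avg_pool2d through a DispatchStub kernel with a dedicated channels-last path, and this revert restores the older frame-based loops. A minimal sketch of the user-level call that path targets; shapes and pooling parameters below are illustrative, not taken from this commit:

import torch

# Any 4-D NCHW tensor works; channels_last only applies to 4-D inputs.
x = torch.randn(2, 8, 16, 16).to(memory_format=torch.channels_last)
pool = torch.nn.AvgPool2d(kernel_size=3, stride=2, padding=1, count_include_pad=False)

y = pool(x)  # on CPU this exercises the avg_pool2d path being changed here
print(y.shape, y.is_contiguous(memory_format=torch.channels_last))

Whether the CPU output comes back in channels_last or plain contiguous layout depends on which side of this revert the build is on; the averaged values are the same either way.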


@@ -1,6 +1,8 @@
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/Pool.h>
#include <tuple>
namespace at {
@@ -8,9 +10,98 @@ namespace native {
namespace {
template <typename scalar_t>
static void avg_pool2d_out_frame(
scalar_t *input_data,
scalar_t *output_data,
int64_t nbatch,
int64_t nInputPlane,
int64_t inputWidth,
int64_t inputHeight,
int64_t outputWidth,
int64_t outputHeight,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
bool count_include_pad,
c10::optional<int64_t> divisor_override)
{
at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) {
for (auto k = start; k < end; k++)
{
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t p;
for(p = 0; p < nbatch; p++)
{
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t xx, yy;
/* For all output pixels... */
scalar_t *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
const scalar_t *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t i;
for(i = 0; i < outputWidth*outputHeight; i++)
ptr_output[i] = 0;
for(yy = 0; yy < outputHeight; yy++)
{
for(xx = 0; xx < outputWidth; xx++)
{
/* Compute the mean of the input image... */
int64_t hstart = yy * dH - padH;
int64_t wstart = xx * dW - padW;
int64_t hend = std::min(hstart + kH, inputHeight + padH);
int64_t wend = std::min(wstart + kW, inputWidth + padW);
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
int pool_size = (hend - hstart) * (wend - wstart);
hstart = std::max(hstart, (int64_t) 0);
wstart = std::max(wstart, (int64_t) 0);
hend = std::min(hend, inputHeight);
wend = std::min(wend, inputWidth);
if (hstart >= hend || wstart >= wend) {
++ptr_output;
continue;
}
scalar_t sum = 0;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int divide_factor;
if (divisor_override.has_value()) {
divide_factor = divisor_override.value();
} else {
if(count_include_pad) {
divide_factor = pool_size;
} else {
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
divide_factor = (hend - hstart) * (wend - wstart);
}
}
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t kx, ky;
for(ky = hstart; ky < hend; ky++)
{
for(kx = wstart; kx < wend; kx++)
sum += ptr_input[ky*inputWidth + kx];
}
/* Update output */
*ptr_output++ += sum/divide_factor;
}
}
}
}
});
}
void avg_pool2d_out_cpu_template(
Tensor &output,
const Tensor &input,
const Tensor &input_,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
@@ -38,41 +129,140 @@ void avg_pool2d_out_cpu_template(
TORCH_CHECK(!divisor_override.has_value() || divisor_override.value() != 0,
"divisor must be not zero");
TORCH_CHECK(input.dtype() == output.dtype(),
"expected dtype ", input.dtype(), " for `output` but got dtype ", output.dtype());
/* sizes */
const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1;
const int64_t nInputPlane = input.size(-3);
const int64_t inputHeight = input.size(-2);
const int64_t inputWidth = input.size(-1);
const int64_t nbatch = input_.ndimension() == 4 ? input_.size(-4) : 1;
const int64_t nInputPlane = input_.size(-3);
const int64_t inputHeight = input_.size(-2);
const int64_t inputWidth = input_.size(-1);
const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, 1, ceil_mode);
const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, 1, ceil_mode);
pool2d_shape_check(
input,
input_,
kH, kW, dH, dW, padH, padW, 1, 1,
nInputPlane,
inputHeight, inputWidth,
outputHeight, outputWidth, input.suggest_memory_format());
outputHeight, outputWidth, input_.suggest_memory_format());
if (input.ndimension() == 3) {
if (input_.ndimension() == 3) {
output.resize_({nInputPlane, outputHeight, outputWidth});
}
else {
output.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, input.suggest_memory_format());
output.resize_({nbatch, nInputPlane, outputHeight, outputWidth});
}
avg_pool2d_kernel(
kCPU, output, input,
kW, kH, dW, dH, padW, padH,
count_include_pad, divisor_override);
TORCH_CHECK(output.is_contiguous(), "avg_pool2d: output must be contiguous");
Tensor input = input_.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, input.scalar_type(),
"avg_pool2d_out_frame",
[&] {
scalar_t *input_data = input.data_ptr<scalar_t>();
scalar_t *output_data = output.data_ptr<scalar_t>();
avg_pool2d_out_frame(
input_data,
output_data,
nbatch,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
kW, kH,
dW, dH,
padW, padH,
count_include_pad,
divisor_override);
}
);
}
template <typename scalar_t>
static void avg_pool2d_backward_out_frame(
scalar_t *gradInput_data,
scalar_t *gradOutput_data,
int64_t nbatch,
int64_t nInputPlane,
int64_t inputWidth,
int64_t inputHeight,
int64_t outputWidth,
int64_t outputHeight,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
bool count_include_pad,
c10::optional<int64_t> divisor_override)
{
at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) {
for (auto k = start; k < end; k++)
{
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t p;
for(p = 0; p < nbatch; p++)
{
const scalar_t *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t xx, yy;
scalar_t* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
scalar_t *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t i;
for(i=0; i<inputWidth*inputHeight; i++)
ptr_gi[i] = 0.0;
for(yy = 0; yy < outputHeight; yy++)
{
for(xx = 0; xx < outputWidth; xx++)
{
int64_t hstart = yy * dH - padH;
int64_t wstart = xx * dW - padW;
int64_t hend = std::min(hstart + kH, inputHeight + padH);
int64_t wend = std::min(wstart + kW, inputWidth + padW);
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
int pool_size = (hend - hstart) * (wend - wstart);
hstart = std::max(hstart, (int64_t) 0);
wstart = std::max(wstart, (int64_t) 0);
hend = std::min(hend, inputHeight);
wend = std::min(wend, inputWidth);
scalar_t z = *ptr_gradOutput++;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int divide_factor;
if (divisor_override.has_value()) {
divide_factor = divisor_override.value();
} else {
if(count_include_pad) {
divide_factor = pool_size;
} else {
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
divide_factor = (hend - hstart) * (wend - wstart);
}
}
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t kx, ky;
for(ky = hstart ; ky < hend; ky++)
{
for(kx = wstart; kx < wend; kx++)
ptr_gradInput[ky*inputWidth + kx] += z/divide_factor;
}
}
}
}
}
});
}
Tensor& avg_pool2d_backward_out_cpu_template(
Tensor& gradInput,
const Tensor& gradOutput,
const Tensor& gradOutput_,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
@@ -102,11 +292,6 @@ Tensor& avg_pool2d_backward_out_cpu_template(
TORCH_CHECK(!divisor_override.has_value() || divisor_override.value() != 0, "divisor must be not zero");
TORCH_CHECK(input.dtype() == gradOutput.dtype(),
"expected dtype ", input.dtype(), " for `gradOutput` but got dtype ", gradOutput.dtype());
TORCH_CHECK(input.dtype() == gradInput.dtype(),
"expected dtype ", input.dtype(), " for `gradInput` but got dtype ", gradInput.dtype());
/* sizes */
const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1;
const int64_t nInputPlane = input.size(-3); // number of channels (or colors)
@@ -117,7 +302,7 @@ Tensor& avg_pool2d_backward_out_cpu_template(
avg_pool2d_backward_shape_check(
input,
gradOutput,
gradOutput_,
nbatch,
kH, kW, dH, dW, padH, padW,
nInputPlane,
@@ -125,14 +310,34 @@ Tensor& avg_pool2d_backward_out_cpu_template(
outputHeight, outputWidth,
input.suggest_memory_format());
/* resize */
gradInput.resize_(input.sizes(), input.suggest_memory_format());
gradInput.zero_();
/* get contiguous gradOutput */
const Tensor gradOutput = gradOutput_.contiguous();
avg_pool2d_backward_kernel(
kCPU, gradInput, gradOutput,
kW, kH, dW, dH, padW, padH,
count_include_pad, divisor_override);
/* resize */
gradInput.resize_as_(input);
gradInput.zero_();
TORCH_CHECK(gradInput.is_contiguous(), "gradInput must be contiguous");
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, input.scalar_type(),
"avg_pool2d_backward_out_frame",
[&] {
scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
avg_pool2d_backward_out_frame(
gradInput_data,
gradOutput_data,
nbatch,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
kW, kH,
dW, dH,
padW, padH,
count_include_pad,
divisor_override);
}
);
return gradInput;
}
@@ -182,8 +387,7 @@ Tensor avg_pool2d_cpu(
return output;
}
Tensor& avg_pool2d_backward_out_cpu(
const Tensor& gradOutput,
Tensor& avg_pool2d_backward_out_cpu(const Tensor& gradOutput_,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
@@ -195,7 +399,7 @@ Tensor& avg_pool2d_backward_out_cpu(
{
avg_pool2d_backward_out_cpu_template(
gradInput,
gradOutput,
gradOutput_,
input,
kernel_size,
stride,
@@ -207,7 +411,7 @@ Tensor& avg_pool2d_backward_out_cpu(
}
Tensor avg_pool2d_backward_cpu(
const Tensor& gradOutput,
const Tensor& gradOutput_,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
@@ -216,10 +420,10 @@ Tensor avg_pool2d_backward_cpu(
bool count_include_pad,
c10::optional<int64_t> divisor_override)
{
auto gradInput = at::empty({0}, input.options());
auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
avg_pool2d_backward_out_cpu_template(
gradInput,
gradOutput,
gradOutput_,
input,
kernel_size,
stride,
@@ -230,8 +434,5 @@ Tensor avg_pool2d_backward_cpu(
return gradInput;
}
DEFINE_DISPATCH(avg_pool2d_kernel);
DEFINE_DISPATCH(avg_pool2d_backward_kernel);
} // at::native
} // at
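Both the frame-based loops restored above and the removed DispatchStub kernels share the same window arithmetic: the pooling window is first clamped to the padded extent to compute pool_size, then clamped to the real image, and the divisor comes from divisor_override, count_include_pad, or the re-clamped window. A pure-Python sketch of that selection; the function name and signature are mine, for illustration only:

def divide_factor_for_window(oh, ow, kH, kW, dH, dW, padH, padW,
                             H, W, count_include_pad, divisor_override=None):
    # Window in input coordinates, first over the padded extent...
    h0, w0 = oh * dH - padH, ow * dW - padW
    h1, w1 = min(h0 + kH, H + padH), min(w0 + kW, W + padW)
    pool_size = (h1 - h0) * (w1 - w0)          # counts padding cells
    # ...then clamped to the real image.
    h0, w0 = max(h0, 0), max(w0, 0)
    h1, w1 = min(h1, H), min(w1, W)
    if divisor_override is not None:
        return divisor_override
    return pool_size if count_include_pad else (h1 - h0) * (w1 - w0)

For example, a 3x3 window at a corner with padding 1 and stride 1 divides by 9 when count_include_pad is true, but only by the 4 in-image cells otherwise.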


@@ -15,12 +15,6 @@ using max_pool2d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& g
DECLARE_DISPATCH(max_pool2d_fn, max_pool2d_kernel);
DECLARE_DISPATCH(max_pool2d_backward_fn, max_pool2d_backward_kernel);
// average pooling has the same signature for forward and backward
using avg_pool2d_fn = void(*)(Tensor& output, const Tensor& input, int kW, int kH,
int dW, int dH, int padW, int padH, bool count_include_pad, c10::optional<int64_t> divisor_override);
DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_kernel);
DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_backward_kernel);
namespace {
template <typename dest_t, typename src_t>


@@ -1,416 +0,0 @@
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/cpu/vec256/vec256.h>
#include <ATen/native/Pool.h>
#include <ATen/native/cpu/utils.h>
namespace at { namespace native {
namespace {
template <typename scalar_t>
void cpu_avg_pool(
Tensor& output_,
const Tensor& input_,
int kW, int kH,
int dW, int dH,
int padW, int padH,
bool count_include_pad,
c10::optional<int64_t> divisor_override) {
auto input = input_.contiguous();
auto output = output_.contiguous();
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
int64_t numel = output.numel();
int64_t ndim = input.ndimension();
// treat batch size and channels as one dimension
int64_t channels = ndim == 3 ? input.size(0) : input.size(0) * input.size(1);
int64_t input_height = input.size(-2);
int64_t input_width = input.size(-1);
int64_t output_height = output.size(-2);
int64_t output_width = output.size(-1);
// parallel on dim N, C, H, W
at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) {
int64_t c = 0;
int64_t oh = 0;
int64_t ow = 0;
data_index_init(begin, c, channels, oh, output_height, ow, output_width);
for (int64_t i = begin; i < end; i++) {
output_data[i] = static_cast<scalar_t>(0);
// local pointers
scalar_t* input_ptr = input_data + c * input_height * input_width;
// compute the mean of the input image...
int64_t ih0 = oh * dH - padH;
int64_t iw0 = ow * dW - padW;
int64_t ih1 = std::min(ih0 + kH, input_height + padH);
int64_t iw1 = std::min(iw0 + kW, input_width + padW);
int64_t pool_size = (ih1 - ih0) * (iw1 - iw0);
ih0 = std::max(ih0, (int64_t) 0);
iw0 = std::max(iw0, (int64_t) 0);
ih1 = std::min(ih1, input_height);
iw1 = std::min(iw1, input_width);
if (ih0 >= ih1 || iw0 >= iw1) {
// move on to next output index
data_index_step(c, channels, oh, output_height, ow, output_width);
continue;
}
scalar_t sum = 0;
int64_t divide_factor;
if (divisor_override.has_value()) {
divide_factor = divisor_override.value();
} else {
if(count_include_pad) {
divide_factor = pool_size;
} else {
divide_factor = (ih1 - ih0) * (iw1 - iw0);
}
}
for (int64_t ih = ih0; ih < ih1; ih++) {
for (int64_t iw = iw0; iw < iw1; iw++) {
sum += input_ptr[ih * input_width + iw];
}
}
output_data[i] += sum / divide_factor;
// move on to next output index
data_index_step(c, channels, oh, output_height, ow, output_width);
}
});
if (!output_.is_contiguous()) {
output_.copy_(output);
}
}
template <typename scalar_t>
void cpu_avg_pool_channels_last(
Tensor& output_,
const Tensor& input_,
int kW, int kH,
int dW, int dH,
int padW, int padH,
bool count_include_pad,
c10::optional<int64_t> divisor_override) {
TORCH_CHECK(input_.ndimension() == 4,
"average pooling with channels last format supports tensors with 4 dims");
auto memory_format = at::MemoryFormat::ChannelsLast;
auto input = input_.contiguous(memory_format);
auto output = output_.contiguous(memory_format);
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
int64_t nbatch = input.size(0);
int64_t channels = input.size(1);
int64_t input_height = input.size(2);
int64_t input_width = input.size(3);
int64_t output_height = output.size(2);
int64_t output_width = output.size(3);
using Vec = vec256::Vec256<scalar_t>;
// parallel on dim N, H, W
at::parallel_for(0, nbatch * output_height * output_width, 0, [&](int64_t begin, int64_t end) {
int64_t n = 0;
int64_t oh = 0;
int64_t ow = 0;
data_index_init(begin, n, nbatch, oh, output_height, ow, output_width);
int64_t size = channels;
int64_t len = size - (size % Vec::size());
for (int64_t i = begin; i < end; i++) {
// compute the mean of the input image...
int64_t ih0 = oh * dH - padH;
int64_t iw0 = ow * dW - padW;
int64_t ih1 = std::min(ih0 + kH, input_height + padH);
int64_t iw1 = std::min(iw0 + kW, input_width + padW);
int64_t pool_size = (ih1 - ih0) * (iw1 - iw0);
ih0 = std::max(ih0, (int64_t) 0);
iw0 = std::max(iw0, (int64_t) 0);
ih1 = std::min(ih1, input_height);
iw1 = std::min(iw1, input_width);
int64_t divide_factor;
if (divisor_override.has_value()) {
divide_factor = divisor_override.value();
} else {
if(count_include_pad) {
divide_factor = pool_size;
} else {
divide_factor = (ih1 - ih0) * (iw1 - iw0);
}
}
scalar_t* out = output_data + i * channels;
// Pass I: zero the out lane
int64_t d1 = 0;
for (; d1 < len; d1 += Vec::size()) {
Vec out_vec = Vec(scalar_t(0));
out_vec.store(out + d1);
}
for (; d1 < size; d1++) {
out[d1] = scalar_t(0);
}
if (ih0 >= ih1 || iw0 >= iw1) {
// move on to next output index
data_index_step(n, nbatch, oh, output_height, ow, output_width);
continue;
}
// Pass II: compute local sum
for (int64_t ih = ih0; ih < ih1; ih++) {
for (int64_t iw = iw0; iw < iw1; iw++) {
scalar_t* in = input_data + n * input_height * input_width * channels +
ih * input_width * channels + iw * channels;
int64_t d2 = 0;
for (; d2 < len; d2 += Vec::size()) {
Vec out_vec = Vec::loadu(out + d2) + Vec::loadu(in + d2);
out_vec.store(out + d2);
}
for (; d2 < size; d2++) {
out[d2] += in[d2];
}
}
}
// Pass III: compute local average
int64_t d3 = 0;
for (; d3 < len; d3 += Vec::size()) {
Vec out_vec = Vec::loadu(out + d3) / Vec(scalar_t(divide_factor));
out_vec.store(out + d3);
}
for (; d3 < size; d3++) {
out[d3] = out[d3] / divide_factor;
}
// move on to next output index
data_index_step(n, nbatch, oh, output_height, ow, output_width);
}
});
if (!output_.is_contiguous(memory_format)) {
output_.copy_(output);
}
}
template <typename scalar_t>
void cpu_avg_pool_backward(
Tensor& grad_input_,
const Tensor& grad_output_,
int kW, int kH,
int dW, int dH,
int padW, int padH,
bool count_include_pad,
c10::optional<int64_t> divisor_override) {
auto grad_output = grad_output_.contiguous();
auto grad_input = grad_input_.contiguous();
auto grad_output_data = grad_output.data_ptr<scalar_t>();
auto grad_input_data = grad_input.data_ptr<scalar_t>();
int64_t ndim = grad_output.ndimension();
// treat batch size and channels as one dimension
int64_t channels = ndim == 3 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1);
int64_t input_height = grad_input.size(-2);
int64_t input_width = grad_input.size(-1);
int64_t output_height = grad_output.size(-2);
int64_t output_width = grad_output.size(-1);
// parallel on dim of N, C
at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) {
for (int64_t c = begin; c < end; c++) {
scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width;
scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width;
for (int64_t oh = 0; oh < output_height; oh++) {
for (int64_t ow = 0; ow < output_width; ow++) {
int64_t ih0 = oh * dH - padH;
int64_t iw0 = ow * dW - padW;
int64_t ih1 = std::min(ih0 + kH, input_height + padH);
int64_t iw1 = std::min(iw0 + kW, input_width + padW);
int64_t pool_size = (ih1 - ih0) * (iw1 - iw0);
ih0 = std::max(ih0, (int64_t) 0);
iw0 = std::max(iw0, (int64_t) 0);
ih1 = std::min(ih1, input_height);
iw1 = std::min(iw1, input_width);
int64_t divide_factor;
if (divisor_override.has_value()) {
divide_factor = divisor_override.value();
} else {
if(count_include_pad) {
divide_factor = pool_size;
} else {
divide_factor = (ih1 - ih0) * (iw1 - iw0);
}
}
scalar_t grad_delta = grad_output_ptr[oh * output_width + ow] / divide_factor;
for (int64_t ih = ih0; ih < ih1; ih++) {
for (int64_t iw = iw0; iw < iw1; iw++) {
grad_input_ptr[ih * input_width + iw] += grad_delta;
}
}
}
}
}
});
if (!grad_input_.is_contiguous()) {
grad_input_.copy_(grad_input);
}
}
template <typename scalar_t>
void cpu_avg_pool_backward_channels_last(
Tensor& grad_input_,
const Tensor& grad_output_,
int kW, int kH,
int dW, int dH,
int padW, int padH,
bool count_include_pad,
c10::optional<int64_t> divisor_override) {
auto memory_format = at::MemoryFormat::ChannelsLast;
auto grad_input = grad_input_.contiguous(memory_format);
auto grad_output = grad_output_.contiguous(memory_format);
auto grad_input_data = grad_input.data_ptr<scalar_t>();
auto grad_output_data = grad_output.data_ptr<scalar_t>();
int64_t nbatch = grad_input.size(0);
int64_t channels = grad_input.size(1);
int64_t input_height = grad_input.size(2);
int64_t input_width = grad_input.size(3);
int64_t output_height = grad_output.size(2);
int64_t output_width = grad_output.size(3);
using Vec = vec256::Vec256<scalar_t>;
// parallel on dim N
at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) {
for (int64_t n = begin; n < end; n++) {
scalar_t* grad_input_ptr = grad_input_data + n * input_height * input_width * channels;
scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels;
for (int64_t oh = 0; oh < output_height; oh++) {
for (int64_t ow = 0; ow < output_width; ow++) {
int64_t ih0 = oh * dH - padH;
int64_t iw0 = ow * dW - padW;
int64_t ih1 = std::min(ih0 + kH, input_height + padH);
int64_t iw1 = std::min(iw0 + kW, input_width + padW);
int64_t pool_size = (ih1 - ih0) * (iw1 - iw0);
ih0 = std::max(ih0, (int64_t) 0);
iw0 = std::max(iw0, (int64_t) 0);
ih1 = std::min(ih1, input_height);
iw1 = std::min(iw1, input_width);
int64_t divide_factor;
if (divisor_override.has_value()) {
divide_factor = divisor_override.value();
} else {
if(count_include_pad) {
divide_factor = pool_size;
} else {
divide_factor = (ih1 - ih0) * (iw1 - iw0);
}
}
scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels;
int64_t size = channels;
int64_t len = size - (size % Vec::size());
for (int64_t ih = ih0; ih < ih1; ih++) {
for (int64_t iw = iw0; iw < iw1; iw++) {
scalar_t* gin = grad_input_ptr + ih * input_width * channels + iw * channels;
int64_t d = 0;
for (; d < len; d += Vec::size()) {
Vec gin_vec = Vec::loadu(gin + d) + Vec::loadu(gout + d) / Vec(scalar_t(divide_factor));
gin_vec.store(gin + d);
}
for (; d < size; d++) {
gin[d] += gout[d] / divide_factor;
}
}
}
}
}
}
});
if (!grad_input_.is_contiguous(memory_format)) {
grad_input_.copy_(grad_input);
}
}
void avg_pool2d_kernel_impl(
Tensor& output,
const Tensor& input,
int kW, int kH,
int dW, int dH,
int padW, int padH,
bool count_include_pad,
c10::optional<int64_t> divisor_override) {
switch (input.suggest_memory_format()) {
case at::MemoryFormat::Contiguous: {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, input.scalar_type(), "avg_pool2d", [&] {
cpu_avg_pool<scalar_t>(output, input, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override);
});
break;
}
case at::MemoryFormat::ChannelsLast: {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, input.scalar_type(), "avg_pool2d_channels_last", [&] {
cpu_avg_pool_channels_last<scalar_t>(output, input, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override);
});
break;
}
default:
TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
}
}
void avg_pool2d_backward_kernel_impl(
Tensor& grad_input,
const Tensor& grad_output,
int kW, int kH,
int dW, int dH,
int padW, int padH,
bool count_include_pad,
c10::optional<int64_t> divisor_override) {
switch (grad_output.suggest_memory_format()) {
case at::MemoryFormat::Contiguous: {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, grad_output.scalar_type(), "avg_pool2d_backward", [&] {
cpu_avg_pool_backward<scalar_t>(grad_input, grad_output, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override);
});
break;
}
case at::MemoryFormat::ChannelsLast: {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, grad_output.scalar_type(), "avg_pool2d_backward_channels_last", [&] {
cpu_avg_pool_backward_channels_last<scalar_t>(grad_input, grad_output, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override);
});
break;
}
default:
TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
}
}
} // anonymous namespace
REGISTER_DISPATCH(avg_pool2d_kernel, &avg_pool2d_kernel_impl);
REGISTER_DISPATCH(avg_pool2d_backward_kernel, &avg_pool2d_backward_kernel_impl);
}} // at::native
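The deleted kernel file above distinguishes the two layouts purely by how an element (n, c, h, w) is addressed: the contiguous path walks one H*W plane per (n, c), while the channels-last path keeps all channels of a pixel adjacent, which is what lets the inner loop vectorize across C with Vec256. A sketch of the two offset formulas; helper names are mine:

def offset_nchw(n, c, h, w, C, H, W):
    # contiguous (NCHW): each channel plane is a dense H*W block
    return ((n * C + c) * H + h) * W + w

def offset_nhwc(n, c, h, w, C, H, W):
    # channels_last (NHWC): all C values of one pixel sit side by side
    return ((n * H + h) * W + w) * C + c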


@@ -13156,8 +13156,7 @@ class TestNNDeviceType(NNTestCase):
with self.assertRaisesRegex(RuntimeError, "not implemented"):
output = module(input)
@onlyOnCPUAndCUDA
@dtypes(torch.float, torch.double)
@onlyCUDA
@dtypesIfCUDA(torch.half, torch.float, torch.double)
def test_avg_pool2d_nhwc(self, device, dtype):
def helper(n, c, h, w, kernel_size, stride=None,
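The test diff above drops test_avg_pool2d_nhwc back to CUDA-only. The kind of check such a helper performs is, roughly, a channels_last run compared against a contiguous reference; the sketch below is an approximation, not the actual test body, and the helper name, shapes, and parameters are illustrative:

import torch

def check_avg_pool2d_nhwc(n=2, c=8, h=8, w=8, kernel_size=3, stride=2):
    x = torch.randn(n, c, h, w, device="cpu", dtype=torch.double)
    ref = torch.nn.functional.avg_pool2d(x, kernel_size, stride)
    out = torch.nn.functional.avg_pool2d(
        x.contiguous(memory_format=torch.channels_last), kernel_size, stride)
    # values must agree regardless of memory format
    assert torch.allclose(out, ref)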


@@ -792,7 +792,6 @@ aten_cpu_source_list = sorted(aten_cpu_source_non_codegen_list + aten_cpu_source
# ${cpu_kernel_cpp} in aten/src/ATen/CMakeLists.txt.
aten_native_source_codegen_list = [
"aten/src/ATen/native/cpu/Activation.cpp",
"aten/src/ATen/native/cpu/AvgPoolKernel.cpp",
"aten/src/ATen/native/cpu/BinaryOpsKernel.cpp",
"aten/src/ATen/native/cpu/BlasKernel.cpp",
"aten/src/ATen/native/cpu/CatKernel.cpp",