Revert D25399466: add channels last support for AvgPool2d on CPU
Test Plan: revert-hammer
Differential Revision: D25399466 (8ac0917cc7)
Original commit changeset: 9477b0c281c0
fbshipit-source-id: e0245f0e390f5eca228445fd82d6e5142a827abc
Committed by: Facebook GitHub Bot
Parent: 0caec739a3
Commit: 49a8942a77
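For context, a minimal Python sketch (not part of this diff) of the user-level call that the reverted kernel accelerated; the module configuration and tensor shapes here are illustrative only, and after the revert the channels-last input is expected to be handled by the generic contiguous CPU path:

import torch

# NCHW input and the same data in channels-last (NHWC) layout
x = torch.randn(2, 16, 8, 8)
x_nhwc = x.contiguous(memory_format=torch.channels_last)

pool = torch.nn.AvgPool2d(kernel_size=3, stride=2, padding=1, count_include_pad=False)

y = pool(x)            # contiguous CPU kernel
y_nhwc = pool(x_nhwc)  # channels-last input; serviced by the contiguous kernel once this revert lands

# Numerics should agree regardless of which CPU kernel handled the call.
print(torch.allclose(y, y_nhwc))
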
@@ -1,6 +1,8 @@
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/Pool.h>
#include <tuple>


namespace at {
@@ -8,9 +10,98 @@ namespace native {

namespace {

template <typename scalar_t>
static void avg_pool2d_out_frame(
  scalar_t *input_data,
  scalar_t *output_data,
  int64_t nbatch,
  int64_t nInputPlane,
  int64_t inputWidth,
  int64_t inputHeight,
  int64_t outputWidth,
  int64_t outputHeight,
  int kW,
  int kH,
  int dW,
  int dH,
  int padW,
  int padH,
  bool count_include_pad,
  c10::optional<int64_t> divisor_override)
{
  at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) {
    for (auto k = start; k < end; k++)
    {
      // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
      int64_t p;
      for(p = 0; p < nbatch; p++)
      {
        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
        int64_t xx, yy;
        /* For all output pixels... */
        scalar_t *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
        const scalar_t *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
        int64_t i;
        for(i = 0; i < outputWidth*outputHeight; i++)
          ptr_output[i] = 0;

        for(yy = 0; yy < outputHeight; yy++)
        {
          for(xx = 0; xx < outputWidth; xx++)
          {
            /* Compute the mean of the input image... */
            int64_t hstart = yy * dH - padH;
            int64_t wstart = xx * dW - padW;
            int64_t hend = std::min(hstart + kH, inputHeight + padH);
            int64_t wend = std::min(wstart + kW, inputWidth + padW);
            // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
            int pool_size = (hend - hstart) * (wend - wstart);
            hstart = std::max(hstart, (int64_t) 0);
            wstart = std::max(wstart, (int64_t) 0);
            hend = std::min(hend, inputHeight);
            wend = std::min(wend, inputWidth);

            if (hstart >= hend || wstart >= wend) {
              ++ptr_output;
              continue;
            }

            scalar_t sum = 0;

            // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
            int divide_factor;
            if (divisor_override.has_value()) {
              divide_factor = divisor_override.value();
            } else {
              if(count_include_pad) {
                divide_factor = pool_size;
              } else {
                // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
                divide_factor = (hend - hstart) * (wend - wstart);
              }
            }

            // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
            int64_t kx, ky;

            for(ky = hstart; ky < hend; ky++)
            {
              for(kx = wstart; kx < wend; kx++)
                sum += ptr_input[ky*inputWidth + kx];
            }
            /* Update output */
            *ptr_output++ += sum/divide_factor;
          }
        }
      }
    }
  });
}

void avg_pool2d_out_cpu_template(
  Tensor &output,
  const Tensor &input,
  const Tensor &input_,
  IntArrayRef kernel_size,
  IntArrayRef stride,
  IntArrayRef padding,
@@ -38,41 +129,140 @@ void avg_pool2d_out_cpu_template(
  TORCH_CHECK(!divisor_override.has_value() || divisor_override.value() != 0,
    "divisor must be not zero");

  TORCH_CHECK(input.dtype() == output.dtype(),
    "expected dtype ", input.dtype(), " for `output` but got dtype ", output.dtype());

  /* sizes */
  const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1;
  const int64_t nInputPlane = input.size(-3);
  const int64_t inputHeight = input.size(-2);
  const int64_t inputWidth = input.size(-1);
  const int64_t nbatch = input_.ndimension() == 4 ? input_.size(-4) : 1;
  const int64_t nInputPlane = input_.size(-3);
  const int64_t inputHeight = input_.size(-2);
  const int64_t inputWidth = input_.size(-1);

  const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, 1, ceil_mode);
  const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, 1, ceil_mode);

  pool2d_shape_check(
    input,
    input_,
    kH, kW, dH, dW, padH, padW, 1, 1,
    nInputPlane,
    inputHeight, inputWidth,
    outputHeight, outputWidth, input.suggest_memory_format());
    outputHeight, outputWidth, input_.suggest_memory_format());

  if (input.ndimension() == 3) {
  if (input_.ndimension() == 3) {
    output.resize_({nInputPlane, outputHeight, outputWidth});
  }
  else {
    output.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, input.suggest_memory_format());
    output.resize_({nbatch, nInputPlane, outputHeight, outputWidth});
  }

  avg_pool2d_kernel(
    kCPU, output, input,
    kW, kH, dW, dH, padW, padH,
    count_include_pad, divisor_override);
  TORCH_CHECK(output.is_contiguous(), "avg_pool2d: output must be contiguous");

  Tensor input = input_.contiguous();

  AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, input.scalar_type(),
    "avg_pool2d_out_frame",
    [&] {
      scalar_t *input_data = input.data_ptr<scalar_t>();
      scalar_t *output_data = output.data_ptr<scalar_t>();

      avg_pool2d_out_frame(
        input_data,
        output_data,
        nbatch,
        nInputPlane,
        inputWidth, inputHeight,
        outputWidth, outputHeight,
        kW, kH,
        dW, dH,
        padW, padH,
        count_include_pad,
        divisor_override);
    }
  );
}

template <typename scalar_t>
static void avg_pool2d_backward_out_frame(
  scalar_t *gradInput_data,
  scalar_t *gradOutput_data,
  int64_t nbatch,
  int64_t nInputPlane,
  int64_t inputWidth,
  int64_t inputHeight,
  int64_t outputWidth,
  int64_t outputHeight,
  int kW,
  int kH,
  int dW,
  int dH,
  int padW,
  int padH,
  bool count_include_pad,
  c10::optional<int64_t> divisor_override)
{
  at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) {
    for (auto k = start; k < end; k++)
    {
      // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
      int64_t p;
      for(p = 0; p < nbatch; p++)
      {
        const scalar_t *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
        int64_t xx, yy;

        scalar_t* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
        scalar_t *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;

        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
        int64_t i;
        for(i=0; i<inputWidth*inputHeight; i++)
          ptr_gi[i] = 0.0;

        for(yy = 0; yy < outputHeight; yy++)
        {
          for(xx = 0; xx < outputWidth; xx++)
          {
            int64_t hstart = yy * dH - padH;
            int64_t wstart = xx * dW - padW;
            int64_t hend = std::min(hstart + kH, inputHeight + padH);
            int64_t wend = std::min(wstart + kW, inputWidth + padW);
            // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
            int pool_size = (hend - hstart) * (wend - wstart);
            hstart = std::max(hstart, (int64_t) 0);
            wstart = std::max(wstart, (int64_t) 0);
            hend = std::min(hend, inputHeight);
            wend = std::min(wend, inputWidth);

            scalar_t z = *ptr_gradOutput++;

            // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
            int divide_factor;
            if (divisor_override.has_value()) {
              divide_factor = divisor_override.value();
            } else {
              if(count_include_pad) {
                divide_factor = pool_size;
              } else {
                // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
                divide_factor = (hend - hstart) * (wend - wstart);
              }
            }

            // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
            int64_t kx, ky;
            for(ky = hstart ; ky < hend; ky++)
            {
              for(kx = wstart; kx < wend; kx++)
                ptr_gradInput[ky*inputWidth + kx] += z/divide_factor;
            }
          }
        }
      }
    }
  });
}

Tensor& avg_pool2d_backward_out_cpu_template(
  Tensor& gradInput,
  const Tensor& gradOutput,
  const Tensor& gradOutput_,
  const Tensor& input,
  IntArrayRef kernel_size,
  IntArrayRef stride,
@@ -102,11 +292,6 @@ Tensor& avg_pool2d_backward_out_cpu_template(

  TORCH_CHECK(!divisor_override.has_value() || divisor_override.value() != 0, "divisor must be not zero");

  TORCH_CHECK(input.dtype() == gradOutput.dtype(),
    "expected dtype ", input.dtype(), " for `gradOutput` but got dtype ", gradOutput.dtype());
  TORCH_CHECK(input.dtype() == gradInput.dtype(),
    "expected dtype ", input.dtype(), " for `gradInput` but got dtype ", gradInput.dtype());

  /* sizes */
  const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1;
  const int64_t nInputPlane = input.size(-3); // number of channels (or colors)
@@ -117,7 +302,7 @@ Tensor& avg_pool2d_backward_out_cpu_template(

  avg_pool2d_backward_shape_check(
    input,
    gradOutput,
    gradOutput_,
    nbatch,
    kH, kW, dH, dW, padH, padW,
    nInputPlane,
@@ -125,14 +310,34 @@ Tensor& avg_pool2d_backward_out_cpu_template(
    outputHeight, outputWidth,
    input.suggest_memory_format());

  /* resize */
  gradInput.resize_(input.sizes(), input.suggest_memory_format());
  gradInput.zero_();
  /* get contiguous gradOutput */
  const Tensor gradOutput = gradOutput_.contiguous();

  avg_pool2d_backward_kernel(
    kCPU, gradInput, gradOutput,
    kW, kH, dW, dH, padW, padH,
    count_include_pad, divisor_override);
  /* resize */
  gradInput.resize_as_(input);
  gradInput.zero_();
  TORCH_CHECK(gradInput.is_contiguous(), "gradInput must be contiguous");

  AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, input.scalar_type(),
    "avg_pool2d_backward_out_frame",
    [&] {
      scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
      scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();

      avg_pool2d_backward_out_frame(
        gradInput_data,
        gradOutput_data,
        nbatch,
        nInputPlane,
        inputWidth, inputHeight,
        outputWidth, outputHeight,
        kW, kH,
        dW, dH,
        padW, padH,
        count_include_pad,
        divisor_override);
    }
  );

  return gradInput;
}
@@ -182,8 +387,7 @@ Tensor avg_pool2d_cpu(
  return output;
}

Tensor& avg_pool2d_backward_out_cpu(
  const Tensor& gradOutput,
Tensor& avg_pool2d_backward_out_cpu(const Tensor& gradOutput_,
  const Tensor& input,
  IntArrayRef kernel_size,
  IntArrayRef stride,
@@ -195,7 +399,7 @@ Tensor& avg_pool2d_backward_out_cpu(
{
  avg_pool2d_backward_out_cpu_template(
    gradInput,
    gradOutput,
    gradOutput_,
    input,
    kernel_size,
    stride,
@@ -207,7 +411,7 @@ Tensor& avg_pool2d_backward_out_cpu(
}

Tensor avg_pool2d_backward_cpu(
  const Tensor& gradOutput,
  const Tensor& gradOutput_,
  const Tensor& input,
  IntArrayRef kernel_size,
  IntArrayRef stride,
@@ -216,10 +420,10 @@ Tensor avg_pool2d_backward_cpu(
  bool count_include_pad,
  c10::optional<int64_t> divisor_override)
{
  auto gradInput = at::empty({0}, input.options());
  auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  avg_pool2d_backward_out_cpu_template(
    gradInput,
    gradOutput,
    gradOutput_,
    input,
    kernel_size,
    stride,
@@ -230,8 +434,5 @@ Tensor avg_pool2d_backward_cpu(
  return gradInput;
}

DEFINE_DISPATCH(avg_pool2d_kernel);
DEFINE_DISPATCH(avg_pool2d_backward_kernel);

} // at::native
} // at

@@ -15,12 +15,6 @@ using max_pool2d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& g
DECLARE_DISPATCH(max_pool2d_fn, max_pool2d_kernel);
DECLARE_DISPATCH(max_pool2d_backward_fn, max_pool2d_backward_kernel);

// average pooling has same signature for forward and backward
using avg_pool2d_fn = void(*)(Tensor& output, const Tensor& input, int kW, int kH,
    int dW, int dH, int padW, int padH, bool count_include_pad, c10::optional<int64_t> divisor_override);
DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_kernel);
DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_backward_kernel);

namespace {

template <typename dest_t, typename src_t>

@@ -1,416 +0,0 @@
#include <ATen/ATen.h>

#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/cpu/vec256/vec256.h>
#include <ATen/native/Pool.h>
#include <ATen/native/cpu/utils.h>

namespace at { namespace native {

namespace {

template <typename scalar_t>
void cpu_avg_pool(
    Tensor& output_,
    const Tensor& input_,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    bool count_include_pad,
    c10::optional<int64_t> divisor_override) {
  auto input = input_.contiguous();
  auto output = output_.contiguous();

  auto input_data = input.data_ptr<scalar_t>();
  auto output_data = output.data_ptr<scalar_t>();

  int64_t numel = output.numel();
  int64_t ndim = input.ndimension();
  // treat batch size and channels as one dimension
  int64_t channels = ndim == 3 ? input.size(0) : input.size(0) * input.size(1);
  int64_t input_height = input.size(-2);
  int64_t input_width = input.size(-1);
  int64_t output_height = output.size(-2);
  int64_t output_width = output.size(-1);

  // parallel on dim N, C, H, W
  at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) {
    int64_t c = 0;
    int64_t oh = 0;
    int64_t ow = 0;
    data_index_init(begin, c, channels, oh, output_height, ow, output_width);

    for (int64_t i = begin; i < end; i++) {
      output_data[i] = static_cast<scalar_t>(0);

      // local pointers
      scalar_t* input_ptr = input_data + c * input_height * input_width;

      // compute the mean of the input image...
      int64_t ih0 = oh * dH - padH;
      int64_t iw0 = ow * dW - padW;
      int64_t ih1 = std::min(ih0 + kH, input_height + padH);
      int64_t iw1 = std::min(iw0 + kW, input_width + padW);
      int64_t pool_size = (ih1 - ih0) * (iw1 - iw0);
      ih0 = std::max(ih0, (int64_t) 0);
      iw0 = std::max(iw0, (int64_t) 0);
      ih1 = std::min(ih1, input_height);
      iw1 = std::min(iw1, input_width);

      if (ih0 >= ih1 || iw0 >= iw1) {
        // move on to next output index
        data_index_step(c, channels, oh, output_height, ow, output_width);
        continue;
      }

      scalar_t sum = 0;

      int64_t divide_factor;
      if (divisor_override.has_value()) {
        divide_factor = divisor_override.value();
      } else {
        if(count_include_pad) {
          divide_factor = pool_size;
        } else {
          divide_factor = (ih1 - ih0) * (iw1 - iw0);
        }
      }

      for (int64_t ih = ih0; ih < ih1; ih++) {
        for (int64_t iw = iw0; iw < iw1; iw++) {
          sum += input_ptr[ih * input_width + iw];
        }
      }
      output_data[i] += sum / divide_factor;

      // move on to next output index
      data_index_step(c, channels, oh, output_height, ow, output_width);
    }
  });

  if (!output_.is_contiguous()) {
    output_.copy_(output);
  }
}

template <typename scalar_t>
void cpu_avg_pool_channels_last(
    Tensor& output_,
    const Tensor& input_,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    bool count_include_pad,
    c10::optional<int64_t> divisor_override) {
  TORCH_CHECK(input_.ndimension() == 4,
      "average pooling with channels last format supports tensors with 4 dims");
  auto memory_format = at::MemoryFormat::ChannelsLast;
  auto input = input_.contiguous(memory_format);
  auto output = output_.contiguous(memory_format);

  auto input_data = input.data_ptr<scalar_t>();
  auto output_data = output.data_ptr<scalar_t>();

  int64_t nbatch = input.size(0);
  int64_t channels = input.size(1);
  int64_t input_height = input.size(2);
  int64_t input_width = input.size(3);
  int64_t output_height = output.size(2);
  int64_t output_width = output.size(3);

  using Vec = vec256::Vec256<scalar_t>;
  // parallel on dim N, H, W
  at::parallel_for(0, nbatch * output_height * output_width, 0, [&](int64_t begin, int64_t end) {
    int64_t n = 0;
    int64_t oh = 0;
    int64_t ow = 0;
    data_index_init(begin, n, nbatch, oh, output_height, ow, output_width);

    int64_t size = channels;
    int64_t len = size - (size % Vec::size());
    for (int64_t i = begin; i < end; i++) {
      // compute the mean of the input image...
      int64_t ih0 = oh * dH - padH;
      int64_t iw0 = ow * dW - padW;
      int64_t ih1 = std::min(ih0 + kH, input_height + padH);
      int64_t iw1 = std::min(iw0 + kW, input_width + padW);
      int64_t pool_size = (ih1 - ih0) * (iw1 - iw0);
      ih0 = std::max(ih0, (int64_t) 0);
      iw0 = std::max(iw0, (int64_t) 0);
      ih1 = std::min(ih1, input_height);
      iw1 = std::min(iw1, input_width);

      int64_t divide_factor;
      if (divisor_override.has_value()) {
        divide_factor = divisor_override.value();
      } else {
        if(count_include_pad) {
          divide_factor = pool_size;
        } else {
          divide_factor = (ih1 - ih0) * (iw1 - iw0);
        }
      }

      scalar_t* out = output_data + i * channels;

      // Pass I: zero the out lane
      int64_t d1 = 0;
      for (; d1 < len; d1 += Vec::size()) {
        Vec out_vec = Vec(scalar_t(0));
        out_vec.store(out + d1);
      }
      for (; d1 < size; d1++) {
        out[d1] = scalar_t(0);
      }

      if (ih0 >= ih1 || iw0 >= iw1) {
        // move on to next output index
        data_index_step(n, nbatch, oh, output_height, ow, output_width);
        continue;
      }

      // Pass II: compute local sum
      for (int64_t ih = ih0; ih < ih1; ih++) {
        for (int64_t iw = iw0; iw < iw1; iw++) {
          scalar_t* in = input_data + n * input_height * input_width * channels +
              ih * input_width * channels + iw * channels;

          int64_t d2 = 0;
          for (; d2 < len; d2 += Vec::size()) {
            Vec out_vec = Vec::loadu(out + d2) + Vec::loadu(in + d2);
            out_vec.store(out + d2);
          }
          for (; d2 < size; d2++) {
            out[d2] += in[d2];
          }
        }
      }

      // Pass III: compute local average
      int64_t d3 = 0;
      for (; d3 < len; d3 += Vec::size()) {
        Vec out_vec = Vec::loadu(out + d3) / Vec(scalar_t(divide_factor));
        out_vec.store(out + d3);
      }
      for (; d3 < size; d3++) {
        out[d3] = out[d3] / divide_factor;
      }

      // move on to next output index
      data_index_step(n, nbatch, oh, output_height, ow, output_width);
    }
  });

  if (!output_.is_contiguous(memory_format)) {
    output_.copy_(output);
  }
}

template <typename scalar_t>
void cpu_avg_pool_backward(
    Tensor& grad_input_,
    const Tensor& grad_output_,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    bool count_include_pad,
    c10::optional<int64_t> divisor_override) {
  auto grad_output = grad_output_.contiguous();
  auto grad_input = grad_input_.contiguous();

  auto grad_output_data = grad_output.data_ptr<scalar_t>();
  auto grad_input_data = grad_input.data_ptr<scalar_t>();

  int64_t ndim = grad_output.ndimension();
  // treat batch size and channels as one dimension
  int64_t channels = ndim == 3 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1);
  int64_t input_height = grad_input.size(-2);
  int64_t input_width = grad_input.size(-1);
  int64_t output_height = grad_output.size(-2);
  int64_t output_width = grad_output.size(-1);

  // parallel on dim of N, C
  at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) {
    for (int64_t c = begin; c < end; c++) {
      scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width;
      scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width;

      for (int64_t oh = 0; oh < output_height; oh++) {
        for (int64_t ow = 0; ow < output_width; ow++) {
          int64_t ih0 = oh * dH - padH;
          int64_t iw0 = ow * dW - padW;
          int64_t ih1 = std::min(ih0 + kH, input_height + padH);
          int64_t iw1 = std::min(iw0 + kW, input_width + padW);
          int64_t pool_size = (ih1 - ih0) * (iw1 - iw0);
          ih0 = std::max(ih0, (int64_t) 0);
          iw0 = std::max(iw0, (int64_t) 0);
          ih1 = std::min(ih1, input_height);
          iw1 = std::min(iw1, input_width);

          int64_t divide_factor;
          if (divisor_override.has_value()) {
            divide_factor = divisor_override.value();
          } else {
            if(count_include_pad) {
              divide_factor = pool_size;
            } else {
              divide_factor = (ih1 - ih0) * (iw1 - iw0);
            }
          }

          scalar_t grad_delta = grad_output_ptr[oh * output_width + ow] / divide_factor;
          for (int64_t ih = ih0; ih < ih1; ih++) {
            for (int64_t iw = iw0; iw < iw1; iw++) {
              grad_input_ptr[ih * input_width + iw] += grad_delta;
            }
          }
        }
      }
    }
  });

  if (!grad_input_.is_contiguous()) {
    grad_input_.copy_(grad_input);
  }
}

template <typename scalar_t>
void cpu_avg_pool_backward_channels_last(
    Tensor& grad_input_,
    const Tensor& grad_output_,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    bool count_include_pad,
    c10::optional<int64_t> divisor_override) {
  auto memory_format = at::MemoryFormat::ChannelsLast;
  auto grad_input = grad_input_.contiguous(memory_format);
  auto grad_output = grad_output_.contiguous(memory_format);

  auto grad_input_data = grad_input.data_ptr<scalar_t>();
  auto grad_output_data = grad_output.data_ptr<scalar_t>();

  int64_t nbatch = grad_input.size(0);
  int64_t channels = grad_input.size(1);
  int64_t input_height = grad_input.size(2);
  int64_t input_width = grad_input.size(3);
  int64_t output_height = grad_output.size(2);
  int64_t output_width = grad_output.size(3);

  using Vec = vec256::Vec256<scalar_t>;
  // parallel on dim N
  at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) {
    for (int64_t n = begin; n < end; n++) {
      scalar_t* grad_input_ptr = grad_input_data + n * input_height * input_width * channels;
      scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels;

      for (int64_t oh = 0; oh < output_height; oh++) {
        for (int64_t ow = 0; ow < output_width; ow++) {
          int64_t ih0 = oh * dH - padH;
          int64_t iw0 = ow * dW - padW;
          int64_t ih1 = std::min(ih0 + kH, input_height + padH);
          int64_t iw1 = std::min(iw0 + kW, input_width + padW);
          int64_t pool_size = (ih1 - ih0) * (iw1 - iw0);
          ih0 = std::max(ih0, (int64_t) 0);
          iw0 = std::max(iw0, (int64_t) 0);
          ih1 = std::min(ih1, input_height);
          iw1 = std::min(iw1, input_width);

          int64_t divide_factor;
          if (divisor_override.has_value()) {
            divide_factor = divisor_override.value();
          } else {
            if(count_include_pad) {
              divide_factor = pool_size;
            } else {
              divide_factor = (ih1 - ih0) * (iw1 - iw0);
            }
          }

          scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels;
          int64_t size = channels;
          int64_t len = size - (size % Vec::size());
          for (int64_t ih = ih0; ih < ih1; ih++) {
            for (int64_t iw = iw0; iw < iw1; iw++) {
              scalar_t* gin = grad_input_ptr + ih * input_width * channels + iw * channels;

              int64_t d = 0;
              for (; d < len; d += Vec::size()) {
                Vec gin_vec = Vec::loadu(gin + d) + Vec::loadu(gout + d) / Vec(scalar_t(divide_factor));
                gin_vec.store(gin + d);
              }
              for (; d < size; d++) {
                gin[d] += gout[d] / divide_factor;
              }
            }
          }
        }
      }
    }
  });

  if (!grad_input_.is_contiguous(memory_format)) {
    grad_input_.copy_(grad_input);
  }
}


void avg_pool2d_kernel_impl(
    Tensor& output,
    const Tensor& input,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    bool count_include_pad,
    c10::optional<int64_t> divisor_override) {
  switch (input.suggest_memory_format()) {
    case at::MemoryFormat::Contiguous: {
      AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, input.scalar_type(), "avg_pool2d", [&] {
        cpu_avg_pool<scalar_t>(output, input, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override);
      });
      break;
    }
    case at::MemoryFormat::ChannelsLast: {
      AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, input.scalar_type(), "avg_pool2d_channels_last", [&] {
        cpu_avg_pool_channels_last<scalar_t>(output, input, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override);
      });
      break;
    }
    default:
      TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
  }
}

void avg_pool2d_backward_kernel_impl(
    Tensor& grad_input,
    const Tensor& grad_output,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    bool count_include_pad,
    c10::optional<int64_t> divisor_override) {
  switch (grad_output.suggest_memory_format()) {
    case at::MemoryFormat::Contiguous: {
      AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, grad_output.scalar_type(), "avg_pool2d_backward", [&] {
        cpu_avg_pool_backward<scalar_t>(grad_input, grad_output, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override);
      });
      break;
    }
    case at::MemoryFormat::ChannelsLast: {
      AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, grad_output.scalar_type(), "avg_pool2d_backward_channels_last", [&] {
        cpu_avg_pool_backward_channels_last<scalar_t>(grad_input, grad_output, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override);
      });
      break;
    }
    default:
      TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
  }
}

} // anonymous namespace

REGISTER_DISPATCH(avg_pool2d_kernel, &avg_pool2d_kernel_impl);
REGISTER_DISPATCH(avg_pool2d_backward_kernel, &avg_pool2d_backward_kernel_impl);

}} // at::native
@@ -13156,8 +13156,7 @@ class TestNNDeviceType(NNTestCase):
        with self.assertRaisesRegex(RuntimeError, "not implemented"):
            output = module(input)

    @onlyOnCPUAndCUDA
    @dtypes(torch.float, torch.double)
    @onlyCUDA
    @dtypesIfCUDA(torch.half, torch.float, torch.double)
    def test_avg_pool2d_nhwc(self, device, dtype):
        def helper(n, c, h, w, kernel_size, stride=None,

@@ -792,7 +792,6 @@ aten_cpu_source_list = sorted(aten_cpu_source_non_codegen_list + aten_cpu_source
# ${cpu_kernel_cpp} in aten/src/ATen/CMakeLists.txt.
aten_native_source_codegen_list = [
    "aten/src/ATen/native/cpu/Activation.cpp",
    "aten/src/ATen/native/cpu/AvgPoolKernel.cpp",
    "aten/src/ATen/native/cpu/BinaryOpsKernel.cpp",
    "aten/src/ATen/native/cpu/BlasKernel.cpp",
    "aten/src/ATen/native/cpu/CatKernel.cpp",