Files
pytorch/caffe2/mobile/contrib/opengl/operators/GLPool.cc
ArutyunovG 8e91da4cb3 Windows shared build (#13550)
Summary:
Hi guys,

I'd like to build Caffe2 with more supported options on Windows with Microsoft Visual Studio.
This is the first pull request.
Running scripts/build_windows_shared.bat is able to build Caffe2 with both CMAKE_BUILD_TYPE=Debug and CMAKE_BUILD_TYPE=Release with Visual Studio 14 2015.
CUDA is 9.0, cudnn is 7.0.5, glog, gflags and lmdb are supported on my system.
Python is 3.5, Detectron works from python interface as well.
It was even possible to debug detectron code and step into caffe2_gpu.dll with pdbs built.

Disappointingly, the c10/experimental ops don't build with this Visual Studio generator, so I added a special option, INCLUDE_EXPERIMENTAL_C10_OPS (default ON), to deal with it in build_windows_shared.bat.

After this pull request the next step is to add Visual Studio 2017 support in the script.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13550

Reviewed By: ezyang

Differential Revision: D13042597

Pulled By: orionr

fbshipit-source-id: f313f909f599cd582a1d000eff766eef3a9fc4fc
2018-11-16 12:16:28 -08:00

340 lines
10 KiB
C++

#include "../core/GLFilter.h"
#include "../core/GLImage.h"
#include "../core/ImageAllocator.h"
#include "caffe2/core/timer.h"
#include "caffe2/operators/pool_op.h"
/**
 * GLFilter that runs 2D max or average pooling as a fragment shader.
 *
 * The pooling geometry (kernel, padding, stride, tile sizes) and the pooling
 * mode are handed to GLFilter as a name -> string replacement map at
 * construction time; the kernel size and output texture size are additionally
 * uploaded as uniforms on every draw in pool().
 */
class GLPool : public GLFilter {
 public:
  typedef enum { AveragePool, MaxPool } PoolType;

  /** Integer 2D quantity; x is the horizontal component, y the vertical. */
  struct point {
    int x;
    int y;
  };

  /** Pooling geometry captured when the filter is constructed. */
  struct descriptor {
    int channels;
    point kernel_size;
    point input_padding;
    point input_stride;
    point input_tile_size;
    point output_tile_size;
  };

  // Shader bindings, populated through the BINDING() entries passed to
  // GLFilter in the constructor.
  binding* inputData;
  binding* kernelSize;
  binding* outputSize;

  const descriptor geometry;

  /**
   * @param _geometry pooling geometry; every field except `channels` is
   *                  forwarded to the shader replacement map
   * @param poolType  MaxPool sets MAX_POOL to 1, AveragePool sets it to 0
   * @param _tiling   value forwarded as TILED_POOLING
   */
  GLPool(const descriptor& _geometry, PoolType poolType, bool _tiling)
      : GLFilter(
            "GLPool",
            vertex_shader,
            fragment_shader,
            {
                BINDING(inputData),
                BINDING(kernelSize),
                BINDING(outputSize),
            },
            {/* no uniform blocks */},
            {/* no attributes */},
            {{"KERNEL_SIZE_X", c10::to_string(_geometry.kernel_size.x)},
             {"KERNEL_SIZE_Y", c10::to_string(_geometry.kernel_size.y)},
             {"INPUT_PADDING_X", c10::to_string(_geometry.input_padding.x)},
             {"INPUT_PADDING_Y", c10::to_string(_geometry.input_padding.y)},
             {"INPUT_STRIDE_X", c10::to_string(_geometry.input_stride.x)},
             {"INPUT_STRIDE_Y", c10::to_string(_geometry.input_stride.y)},
             {"INPUT_TILE_WIDTH", c10::to_string(_geometry.input_tile_size.x)},
             {"INPUT_TILE_HEIGHT", c10::to_string(_geometry.input_tile_size.y)},
             {"OUTPUT_TILE_WIDTH",
              c10::to_string(_geometry.output_tile_size.x)},
             {"OUTPUT_TILE_HEIGHT",
              c10::to_string(_geometry.output_tile_size.y)},
             {"TILED_POOLING", c10::to_string(_tiling)},
             {"MAX_POOL", c10::to_string(poolType == MaxPool)},
             // Bounds checking is always enabled here; bounds_check_mode()
             // below is not consulted.
             // NOTE(review): presumably deliberate, since the shader must
             // skip out-of-bounds texels for both max and average - confirm.
             {"BOUNDS_CHECK_MODE", c10::to_string(1)}}),
        geometry(_geometry) {}

  ~GLPool() {}

  /**
   * Pools every image of `input_images` into the image at the same index of
   * `output_images`, issuing one run() per texture slice.
   *
   * The slice count is taken from the input image and used to index the
   * textures of both images, so each output image must have at least as many
   * slices as its input.
   */
  void pool(const GLImageVector<float16_t>& input_images,
            const GLImageVector<float16_t>& output_images) {
    for (int i = 0; i < input_images.size(); i++) {
      auto input_image = input_images[i];
      auto output_image = output_images[i];
      const int input_slices = input_image->slices;

      for (int is = 0; is < input_slices; is++) {
        run({{input_image->textures[is], inputData}},
            {output_image->textures[is]},
            [&]() {
              // Per-draw uniforms: output texture extent and kernel extent.
              glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
              glUniform2i(kernelSize->location, geometry.kernel_size.x, geometry.kernel_size.y);
            },
            output_image->texture_width,
            output_image->texture_height);
      }
    }
  }

 private:
  /*
   * Computes a BOUNDS_CHECK_MODE value for the pooling parameters.
   *
   * Currently unused within this class: the constructor always passes 1.
   *
   * @retval 0 if bounds check can be skipped
   * @retval non-zero if bounds check can not be skipped
   */
  inline static int bounds_check_mode(bool tiling, const descriptor& geometry) {
    if (tiling) {
      return 1;
    }

    if (GLContext::getGLContext()->GL_EXT_texture_border_clamp_defined() ||
        (geometry.input_padding.x == 0 && geometry.input_padding.y == 0)) {
      return 0;
    } else {
      return 1;
    }
  }

  static const char* fragment_shader;
};
// MARK: GLSL
//
// Fragment shader source for GLPool.
//
// The $(NAME) placeholders correspond to the keys of the replacement map that
// the GLPool constructor passes to GLFilter. TEXTURE_INPUT, TEXTURE_OUTPUT,
// TEXTURE_LOAD and TEXTURE_STORE are not defined in this string; presumably
// GLFilter supplies them when it assembles the final shader - confirm there.
//
// Four variants are selected by the preprocessor:
//   * TILED_POOLING != 0: the output texel is split into a tile index and an
//     in-tile coordinate using OUTPUT_TILE_WIDTH/HEIGHT; reads are offset by
//     tile * inputTileSize and bounds-checked against the input tile size.
//   * TILED_POOLING == 0: reads are bounds-checked against textureSize().
//   * MAX_POOL != 0: the accumulator starts at MIN_FLOAT and takes max().
//   * MAX_POOL == 0: in-bounds texels are summed in a highp accumulator and
//     divided by the number of in-bounds texels, so padded positions do not
//     contribute to the average.
//
// The loop limits come from the `kernelSize` uniform; the `kernel_size`
// constant is declared but not referenced by the loops.
const char* GLPool::fragment_shader = R"GLSL(#version 300 es
#define TILED_POOLING $(TILED_POOLING)
#define MAX_POOL $(MAX_POOL)
// tiling
#define INPUT_TILE_WIDTH $(INPUT_TILE_WIDTH)
#define INPUT_TILE_HEIGHT $(INPUT_TILE_HEIGHT)
#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
#define BOUNDS_CHECK_MODE $(BOUNDS_CHECK_MODE)
precision mediump float;
precision mediump int;
in highp vec2 v_texCoord;
const ivec2 input_padding = ivec2($(INPUT_PADDING_X), $(INPUT_PADDING_Y));
const ivec2 input_stride = ivec2($(INPUT_STRIDE_X), $(INPUT_STRIDE_Y));
const ivec2 kernel_size = ivec2($(KERNEL_SIZE_X), $(KERNEL_SIZE_Y));
uniform ivec2 kernelSize;
uniform ivec2 outputSize;
TEXTURE_INPUT(inputData);
TEXTURE_OUTPUT(0, outputData);
#if BOUNDS_CHECK_MODE == 0
#define IN_BOUNDS(p, p0, p1) (true)
#else
#define IN_BOUNDS(p, p0, p1) (all(greaterThanEqual(p, p0)) && all(lessThan(p, p1)))
#endif
// MIN_FLOAT is -2^14, which is the minimum precision requirement for mediump in OpenGL ES 3.0
const float MIN_FLOAT = -exp2(14.0);
#if TILED_POOLING
const ivec2 inputTileSize = ivec2(INPUT_TILE_WIDTH, INPUT_TILE_HEIGHT);
const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
// tiled pooling
#if MAX_POOL
#define POOL { \
pool = vec4(MIN_FLOAT); \
for (int y = 0; y < kernelSize.y; y++) { \
for (int x = 0; x < kernelSize.x; x++) { \
ivec2 idx = tileCoord + ivec2(x, y); \
if IN_BOUNDS(idx, ivec2(0), inputTileSize) { \
vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
pool = max(pool, data); \
} \
} \
} \
}
#else
#define POOL { \
int count = 0; \
for (int y = 0; y < kernelSize.y; y++) { \
for (int x = 0; x < kernelSize.x; x++) { \
ivec2 idx = tileCoord + ivec2(x, y); \
if IN_BOUNDS(idx, ivec2(0), inputTileSize) { \
vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
pool += data;\
count += 1; \
} \
} \
} \
pool = pool / float(count); \
}
#endif // MAX_POOL
void main() {
ivec2 inputSize = textureSize(inputData, 0);
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
tileCoord = input_stride * tileCoord - input_padding;
ivec2 inputTileOffset = tile * inputTileSize;
#if MAX_POOL
vec4 pool = vec4(0);
#else
highp vec4 pool = vec4(0);
#endif
POOL;
outputData = TEXTURE_STORE(pool);
}
#else
// no tiling
#if MAX_POOL
#define POOL { \
pool = vec4(MIN_FLOAT); \
for (int y = 0; y < kernelSize.y; y++) { \
for (int x = 0; x < kernelSize.x; x++) { \
ivec2 idx = texelCoord + ivec2(x, y); \
if IN_BOUNDS(idx, ivec2(0), inputSize) { \
vec4 data = TEXTURE_LOAD(inputData, idx); \
pool = max(pool, data); \
} \
} \
} \
}
#else
#define POOL { \
int count = 0; \
for (int y = 0; y < kernelSize.y; y++) { \
for (int x = 0; x < kernelSize.x; x++) { \
ivec2 idx = texelCoord + ivec2(x, y); \
if IN_BOUNDS(idx, ivec2(0), inputSize) { \
vec4 data = TEXTURE_LOAD(inputData, idx); \
pool += data; \
count += 1; \
} \
} \
} \
pool = pool / float(count); \
}
#endif // MAX_POOL
void main() {
ivec2 inputSize = textureSize(inputData, 0);
ivec2 texelCoord = input_stride * ivec2(v_texCoord * vec2(outputSize)) - input_padding;
#if MAX_POOL
vec4 pool = vec4(0);
#else
highp vec4 pool = vec4(0);
#endif
POOL;
outputData = TEXTURE_STORE(pool);
}
#endif // TILED_POOLING
)GLSL";
namespace caffe2 {
template <typename OPBase>
static void computeOutputHW(OPBase* op, int H, int W, int* OH, int* OW) {
Tensor<CPUContext> input, output;
input.Resize(1, 1, H, W);
op->SetOutputSize(input, &output, 1);
CAFFE_ENFORCE_EQ(output.ndim(), 4);
*OH = output.dim(2);
*OW = output.dim(3);
}
/**
 * Operator that pools a GLImageVector input with a GLPool filter.
 *
 * @tparam T        element type of the input/output GLImageVector
 * @tparam poolType GLPool::AveragePool or GLPool::MaxPool
 *
 * Construction enforces NCHW order, dilation 1, and (unless global pooling is
 * requested) padding strictly smaller than the kernel on every side.
 */
template <typename T, GLPool::PoolType poolType>
class GLPoolOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator<float16_t> {
 public:
  GLPoolOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws) {
    OPERATOR_NEEDS_FEATURE(order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
    CAFFE_ENFORCE(dilation_h() == 1 && dilation_w() == 1,
                  "Pooling op does not support dilation right now.");
    if (!global_pooling_) {
      CAFFE_ENFORCE(pad_t() < kernel_h() && pad_b() < kernel_h() && pad_l() < kernel_w() &&
                        pad_r() < kernel_w(),
                    "Pad should be smaller than kernel.");
    }
  }

  /**
   * Allocates the output image vector, lazily creates the GLPool filter on
   * the first call, runs the pooling pass and stores the result in output 0.
   *
   * @return always true
   */
  bool RunOnDeviceWithOrderNCHW() override {
    const GLImageVector<T>& input = OperatorBase::Inputs()[0]->template Get<GLImageVector<T>>();
    const int num_images = input.size();
    const int input_channels = input.channels();
    const int input_width = input.width();
    const int input_height = input.height();

    int output_height;
    int output_width;
    const int output_channels = input_channels;

    computeOutputHW(this, input_height, input_width, &output_height, &output_width);

    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);

    // The output keeps the input's tile layout.
    const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
    const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;

    GLImageVector<T>* output = ImageAllocator<T>::newImage(
        num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);

    // Every point is {x, y} = {width, height}. GLPool maps
    // output_tile_size.x to OUTPUT_TILE_WIDTH and .y to OUTPUT_TILE_HEIGHT,
    // so the output tile size must be {output_width, output_height}; passing
    // the pair swapped mis-addresses tiles whenever the output is not square.
    GLPool::descriptor geometry{input_channels,
                                {kernel_w(), kernel_h()},
                                {pad_l(), pad_t()},
                                {stride_w(), stride_h()},
                                {input_width, input_height},
                                {output_width, output_height}};

    // The filter is built once and reused on later runs with the geometry of
    // the first run.
    // NOTE(review): if input dimensions can change between runs, the cached
    // filter's geometry would be stale - confirm callers keep shapes fixed.
    if (!glPool_) {
      LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => " << output_channels << ": "
                << output_height << " X " << output_width << " Kernel: " << kernel_w() << "X" << kernel_h()
                << " Tiling: " << input_tile_x << "X" << input_tile_y;

      glPool_.reset(new GLPool(geometry, poolType, input_tile_x > 1 || input_tile_y > 1));
    }

    glPool_->pool(input, *output);
    OperatorBase::Outputs()[0]->Reset(output);

    return true;
  }

 private:
  // Lazily constructed pooling filter; see RunOnDeviceWithOrderNCHW().
  std::unique_ptr<GLPool> glPool_;
};
// Operator registration: exposes GLPoolOp<float16_t, ...> under the names
// OpenGLAveragePool and OpenGLMaxPool via REGISTER_CPU_OPERATOR, and declares
// a schema with exactly one input and one output for each.
namespace {
REGISTER_CPU_OPERATOR(OpenGLAveragePool, GLPoolOp<float16_t, GLPool::AveragePool>);
REGISTER_CPU_OPERATOR(OpenGLMaxPool, GLPoolOp<float16_t, GLPool::MaxPool>);
OPERATOR_SCHEMA(OpenGLAveragePool).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(OpenGLMaxPool).NumInputs(1).NumOutputs(1);
}; // namespace
}; // namespace caffe2