caffe2 mobile opengl (#15322)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15322

caffe2 mobile opengl code is not used, deleting it to reduce complications when we perform other changes

Reviewed By: Maratyszcza

Differential Revision: D13499943

fbshipit-source-id: 6479f6b9f50f08b5ae28f8f0bc4a1c4fc3f3c3c2
This commit is contained in:
Jerry Zhang
2018-12-18 08:17:56 -08:00
committed by Facebook Github Bot
parent 54d8ce94ee
commit 12cf5178aa
70 changed files with 1 additions and 11807 deletions

View File

@ -92,7 +92,6 @@ option(USE_LEVELDB "Use LEVELDB" ON)
option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
option(USE_LMDB "Use LMDB" ON)
option(USE_METAL "Use Metal for iOS build" ON)
option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
option(USE_NCCL "Use NCCL" ON)
option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF)

View File

@ -1,8 +1,4 @@
add_subdirectory(ios)
# [FIX later or remove] opengl code will be broken because of tensor refactoring, remove this from CI to unblock
if(USE_MOBILE_OPENGL AND (ANDROID OR IOS))
# add_subdirectory(opengl)
endif()
if (USE_ACL)
# add_subdirectory(arm-compute)
endif()

View File

@ -1,11 +0,0 @@
add_subdirectory(core)
add_subdirectory(operators)
if (ANDROID)
add_subdirectory(android)
endif()
if (IOS)
add_subdirectory(ios)
endif()
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)

View File

@ -1,156 +0,0 @@
#include "AndroidGLContext.h"
#include "caffe2/core/logging.h"
#include "gl3stub.h"
#include <regex>
namespace {
static const std::unordered_map<std::string, GL_Renderer>& renderer_map() {
static std::unordered_map<std::string, GL_Renderer> m = {
{"Adreno", Adreno},
{"Mali", Mali},
{"NVIDIA", Tegra} /*, {"PowerVR", PowerVR} */};
return m;
}
} // namespace
EGLContext AndroidGLContext::create_opengl_thread_context() {
EGLSurface surface = EGL_NO_SURFACE;
EGLContext context = EGL_NO_CONTEXT;
EGLDisplay display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
if (display == EGL_NO_DISPLAY) {
// We failed to get a display
CAFFE_THROW("Problem with OpenGL context");
return context;
}
EGLint major;
EGLint minor;
eglInitialize(display, &major, &minor);
const EGLint configAttr[] = {EGL_RENDERABLE_TYPE,
EGL_OPENGL_ES2_BIT,
EGL_SURFACE_TYPE,
EGL_PBUFFER_BIT, // we create a pixelbuffer surface
EGL_NONE};
EGLint numConfig;
EGLConfig eglConfig;
if (!eglChooseConfig(display, configAttr, &eglConfig, 1, &numConfig)) {
// We failed to find a suitable config
eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
eglTerminate(display);
display = EGL_NO_DISPLAY;
CAFFE_THROW("Problem with OpenGL context");
return context;
}
const EGLint ctxAttr[] = {EGL_CONTEXT_CLIENT_VERSION,
2, // very important!
EGL_NONE};
// Create an EGL context based on the chosen configuration.
context = eglCreateContext(display, eglConfig, EGL_NO_CONTEXT, ctxAttr);
// We need a surface. For most mixed JNI/Java based apps it is suggested
// that we pass a Java surface through JNI and extract the surface
// Pure NDK apps get passed the android_app structure which includes a surface
// We want our own OpenGL context for the current thread.
// Here we create a fake 1x1 'pixel buffer' surface.
// We don't expecting to run vertex or fragment shaders.
const EGLint surfaceAttr[] = {EGL_WIDTH, 1, EGL_HEIGHT, 1, EGL_NONE};
surface = eglCreatePbufferSurface(display, eglConfig, surfaceAttr);
// Bind context, draw and surface to current thread
eglMakeCurrent(display, surface, surface, context);
// Bind the API for this context. In our case we want to use OpenGL_ES
eglBindAPI(EGL_OPENGL_ES_API);
return context;
}
bool AndroidGLContext::opengl_thread_context_exists() {
return eglGetCurrentContext() != EGL_NO_CONTEXT;
}
bool AndroidGLContext::release_opengl_thread_context() {
EGLContext display = eglGetCurrentDisplay();
if (display != EGL_NO_DISPLAY) {
if (_eglcontext != EGL_NO_CONTEXT) {
eglDestroyContext(display, _eglcontext);
_eglcontext = EGL_NO_CONTEXT;
}
EGLSurface surface = eglGetCurrentSurface(EGL_DRAW);
if (surface != EGL_NO_SURFACE) {
eglDestroySurface(display, surface);
surface = EGL_NO_SURFACE;
}
surface = eglGetCurrentSurface(EGL_READ);
if (surface != EGL_NO_SURFACE) {
eglDestroySurface(display, surface);
surface = EGL_NO_SURFACE;
}
eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
eglTerminate(display);
display = EGL_NO_DISPLAY;
}
eglReleaseThread();
return true;
}
void AndroidGLContext::init_gles3() {
if (!gl3stubInit()) {
CAFFE_THROW("OpenGL ES 3 not initialized");
} else {
LOG(INFO) << "OpenGL ES 3 successfully enabled";
}
}
GL_Renderer AndroidGLContext::get_platform() {
std::string rendererStr((const char*)glGetString(GL_RENDERER));
std::regex regStr("^[A-Za-z]*");
std::smatch matchs;
if (std::regex_search(rendererStr, matchs, regStr)) {
const std::string renderer = *matchs.begin();
auto found = renderer_map().find(renderer);
if (found != renderer_map().end()) {
return found->second;
}
}
CAFFE_THROW("Unsupported GPU renderer");
}
AndroidGLContext::AndroidGLContext() {
if (!opengl_thread_context_exists()) {
_eglcontext = create_opengl_thread_context();
LOG(INFO) << "New EGLContext created";
if (!supportOpenGLES3(&half_float_supported)) {
CAFFE_THROW("OpenGL ES 3 not supported");
}
if (!isSupportedDevice()) {
LOG(ERROR) << "Device not fully supported";
}
} else {
_eglcontext = EGL_NO_CONTEXT;
LOG(INFO) << "Reusing EGLContext, make sure OpenGL ES 3 is supported";
}
static std::once_flag once;
std::call_once(once, [&]() { init_gles3(); });
}
AndroidGLContext::~AndroidGLContext() {
if (_eglcontext != EGL_NO_CONTEXT) {
release_opengl_thread_context();
}
}
void AndroidGLContext::set_context() {}
void AndroidGLContext::reset_context() {}
void AndroidGLContext::flush_context() {}

View File

@ -1,26 +0,0 @@
#pragma once
#include "../core/GLContext.h"
#include "../core/GLTexture.h"
#include <unordered_map>
enum GL_Renderer { Adreno, Mali, Tegra /*, PowerVR */ };
class AndroidGLContext : public GLContext {
private:
EGLContext _eglcontext;
EGLContext create_opengl_thread_context();
bool opengl_thread_context_exists();
bool release_opengl_thread_context();
public:
AndroidGLContext();
~AndroidGLContext();
void set_context();
void reset_context();
void flush_context();
void init_gles3();
GL_Renderer get_platform();
};

View File

@ -1,2 +0,0 @@
file(GLOB_RECURSE tmp *.cc *.c)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)

View File

@ -1,19 +0,0 @@
#include "AndroidGLContext.h"
std::unique_ptr<GLContext> GLContext::_glcontext = nullptr;
void GLContext::initGLContext() {
if (_glcontext == nullptr) {
_glcontext.reset(new AndroidGLContext());
}
}
GLContext* GLContext::getGLContext() {
if (_glcontext == nullptr) {
initGLContext();
}
return _glcontext.get();
}
void GLContext::deleteGLContext() { _glcontext.reset(nullptr); }

View File

@ -1,11 +0,0 @@
#include "../core/GLImageAllocator.h"
#include "../core/arm_neon_support.h"
template <typename T>
GLImageAllocator<T>* GLImageAllocator<T>::newGLImageAllocator() {
return new GLImageAllocator<T>();
}
template GLImageAllocator<float16_t>* GLImageAllocator<float16_t>::newGLImageAllocator();
template GLImageAllocator<uint8_t>* GLImageAllocator<uint8_t>::newGLImageAllocator();

View File

@ -1,5 +0,0 @@
#pragma once
#include <arm_neon.h>
typedef __fp16 float16_t;

View File

@ -1,357 +0,0 @@
// clang-format off
#include <EGL/egl.h>
#include "gl3stub.h"
GLboolean gl3stubInit() {
#define FIND_PROC(s) s = (void*)eglGetProcAddress(#s)
FIND_PROC(glReadBuffer);
FIND_PROC(glDrawRangeElements);
FIND_PROC(glTexImage3D);
FIND_PROC(glTexSubImage3D);
FIND_PROC(glCopyTexSubImage3D);
FIND_PROC(glCompressedTexImage3D);
FIND_PROC(glCompressedTexSubImage3D);
FIND_PROC(glGenQueries);
FIND_PROC(glDeleteQueries);
FIND_PROC(glIsQuery);
FIND_PROC(glBeginQuery);
FIND_PROC(glEndQuery);
FIND_PROC(glGetQueryiv);
FIND_PROC(glGetQueryObjectuiv);
FIND_PROC(glUnmapBuffer);
FIND_PROC(glGetBufferPointerv);
FIND_PROC(glDrawBuffers);
FIND_PROC(glUniformMatrix2x3fv);
FIND_PROC(glUniformMatrix3x2fv);
FIND_PROC(glUniformMatrix2x4fv);
FIND_PROC(glUniformMatrix4x2fv);
FIND_PROC(glUniformMatrix3x4fv);
FIND_PROC(glUniformMatrix4x3fv);
FIND_PROC(glBlitFramebuffer);
FIND_PROC(glRenderbufferStorageMultisample);
FIND_PROC(glFramebufferTextureLayer);
FIND_PROC(glMapBufferRange);
FIND_PROC(glFlushMappedBufferRange);
FIND_PROC(glBindVertexArray);
FIND_PROC(glDeleteVertexArrays);
FIND_PROC(glGenVertexArrays);
FIND_PROC(glIsVertexArray);
FIND_PROC(glGetIntegeri_v);
FIND_PROC(glBeginTransformFeedback);
FIND_PROC(glEndTransformFeedback);
FIND_PROC(glBindBufferRange);
FIND_PROC(glBindBufferBase);
FIND_PROC(glTransformFeedbackVaryings);
FIND_PROC(glGetTransformFeedbackVarying);
FIND_PROC(glVertexAttribIPointer);
FIND_PROC(glGetVertexAttribIiv);
FIND_PROC(glGetVertexAttribIuiv);
FIND_PROC(glVertexAttribI4i);
FIND_PROC(glVertexAttribI4ui);
FIND_PROC(glVertexAttribI4iv);
FIND_PROC(glVertexAttribI4uiv);
FIND_PROC(glGetUniformuiv);
FIND_PROC(glGetFragDataLocation);
FIND_PROC(glUniform1ui);
FIND_PROC(glUniform2ui);
FIND_PROC(glUniform3ui);
FIND_PROC(glUniform4ui);
FIND_PROC(glUniform1uiv);
FIND_PROC(glUniform2uiv);
FIND_PROC(glUniform3uiv);
FIND_PROC(glUniform4uiv);
FIND_PROC(glClearBufferiv);
FIND_PROC(glClearBufferuiv);
FIND_PROC(glClearBufferfv);
FIND_PROC(glClearBufferfi);
FIND_PROC(glGetStringi);
FIND_PROC(glCopyBufferSubData);
FIND_PROC(glGetUniformIndices);
FIND_PROC(glGetActiveUniformsiv);
FIND_PROC(glGetUniformBlockIndex);
FIND_PROC(glGetActiveUniformBlockiv);
FIND_PROC(glGetActiveUniformBlockName);
FIND_PROC(glUniformBlockBinding);
FIND_PROC(glDrawArraysInstanced);
FIND_PROC(glDrawElementsInstanced);
FIND_PROC(glFenceSync);
FIND_PROC(glIsSync);
FIND_PROC(glDeleteSync);
FIND_PROC(glClientWaitSync);
FIND_PROC(glWaitSync);
FIND_PROC(glGetInteger64v);
FIND_PROC(glGetSynciv);
FIND_PROC(glGetInteger64i_v);
FIND_PROC(glGetBufferParameteri64v);
FIND_PROC(glGenSamplers);
FIND_PROC(glDeleteSamplers);
FIND_PROC(glIsSampler);
FIND_PROC(glBindSampler);
FIND_PROC(glSamplerParameteri);
FIND_PROC(glSamplerParameteriv);
FIND_PROC(glSamplerParameterf);
FIND_PROC(glSamplerParameterfv);
FIND_PROC(glGetSamplerParameteriv);
FIND_PROC(glGetSamplerParameterfv);
FIND_PROC(glVertexAttribDivisor);
FIND_PROC(glBindTransformFeedback);
FIND_PROC(glDeleteTransformFeedbacks);
FIND_PROC(glGenTransformFeedbacks);
FIND_PROC(glIsTransformFeedback);
FIND_PROC(glPauseTransformFeedback);
FIND_PROC(glResumeTransformFeedback);
FIND_PROC(glGetProgramBinary);
FIND_PROC(glProgramBinary);
FIND_PROC(glProgramParameteri);
FIND_PROC(glInvalidateFramebuffer);
FIND_PROC(glInvalidateSubFramebuffer);
FIND_PROC(glTexStorage2D);
FIND_PROC(glTexStorage3D);
FIND_PROC(glGetInternalformativ);
// Bind GL_EXT_texture_border_clamp
FIND_PROC(glTexParameterIivEXT);
FIND_PROC(glTexParameterIuivEXT);
FIND_PROC(glGetTexParameterIivEXT);
FIND_PROC(glGetTexParameterIuivEXT);
FIND_PROC(glSamplerParameterIivEXT);
FIND_PROC(glSamplerParameterIuivEXT);
FIND_PROC(glGetSamplerParameterIivEXT);
FIND_PROC(glGetSamplerParameterIuivEXT);
#undef FIND_PROC
if (!glReadBuffer ||
!glDrawRangeElements ||
!glTexImage3D ||
!glTexSubImage3D ||
!glCopyTexSubImage3D ||
!glCompressedTexImage3D ||
!glCompressedTexSubImage3D ||
!glGenQueries ||
!glDeleteQueries ||
!glIsQuery ||
!glBeginQuery ||
!glEndQuery ||
!glGetQueryiv ||
!glGetQueryObjectuiv ||
!glUnmapBuffer ||
!glGetBufferPointerv ||
!glDrawBuffers ||
!glUniformMatrix2x3fv ||
!glUniformMatrix3x2fv ||
!glUniformMatrix2x4fv ||
!glUniformMatrix4x2fv ||
!glUniformMatrix3x4fv ||
!glUniformMatrix4x3fv ||
!glBlitFramebuffer ||
!glRenderbufferStorageMultisample ||
!glFramebufferTextureLayer ||
!glMapBufferRange ||
!glFlushMappedBufferRange ||
!glBindVertexArray ||
!glDeleteVertexArrays ||
!glGenVertexArrays ||
!glIsVertexArray ||
!glGetIntegeri_v ||
!glBeginTransformFeedback ||
!glEndTransformFeedback ||
!glBindBufferRange ||
!glBindBufferBase ||
!glTransformFeedbackVaryings ||
!glGetTransformFeedbackVarying ||
!glVertexAttribIPointer ||
!glGetVertexAttribIiv ||
!glGetVertexAttribIuiv ||
!glVertexAttribI4i ||
!glVertexAttribI4ui ||
!glVertexAttribI4iv ||
!glVertexAttribI4uiv ||
!glGetUniformuiv ||
!glGetFragDataLocation ||
!glUniform1ui ||
!glUniform2ui ||
!glUniform3ui ||
!glUniform4ui ||
!glUniform1uiv ||
!glUniform2uiv ||
!glUniform3uiv ||
!glUniform4uiv ||
!glClearBufferiv ||
!glClearBufferuiv ||
!glClearBufferfv ||
!glClearBufferfi ||
!glGetStringi ||
!glCopyBufferSubData ||
!glGetUniformIndices ||
!glGetActiveUniformsiv ||
!glGetUniformBlockIndex ||
!glGetActiveUniformBlockiv ||
!glGetActiveUniformBlockName ||
!glUniformBlockBinding ||
!glDrawArraysInstanced ||
!glDrawElementsInstanced ||
!glFenceSync ||
!glIsSync ||
!glDeleteSync ||
!glClientWaitSync ||
!glWaitSync ||
!glGetInteger64v ||
!glGetSynciv ||
!glGetInteger64i_v ||
!glGetBufferParameteri64v ||
!glGenSamplers ||
!glDeleteSamplers ||
!glIsSampler ||
!glBindSampler ||
!glSamplerParameteri ||
!glSamplerParameteriv ||
!glSamplerParameterf ||
!glSamplerParameterfv ||
!glGetSamplerParameteriv ||
!glGetSamplerParameterfv ||
!glVertexAttribDivisor ||
!glBindTransformFeedback ||
!glDeleteTransformFeedbacks ||
!glGenTransformFeedbacks ||
!glIsTransformFeedback ||
!glPauseTransformFeedback ||
!glResumeTransformFeedback ||
!glGetProgramBinary ||
!glProgramBinary ||
!glProgramParameteri ||
!glInvalidateFramebuffer ||
!glInvalidateSubFramebuffer ||
!glTexStorage2D ||
!glTexStorage3D ||
!glGetInternalformativ)
{
return GL_FALSE;
}
return GL_TRUE;
}
/* Function pointer definitions */
GL_APICALL void (* GL_APIENTRY glReadBuffer) (GLenum mode);
GL_APICALL void (* GL_APIENTRY glDrawRangeElements) (GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const GLvoid* indices);
GL_APICALL void (* GL_APIENTRY glTexImage3D) (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const GLvoid* pixels);
GL_APICALL void (* GL_APIENTRY glTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const GLvoid* pixels);
GL_APICALL void (* GL_APIENTRY glCopyTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height);
GL_APICALL void (* GL_APIENTRY glCompressedTexImage3D) (GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const GLvoid* data);
GL_APICALL void (* GL_APIENTRY glCompressedTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const GLvoid* data);
GL_APICALL void (* GL_APIENTRY glGenQueries) (GLsizei n, GLuint* ids);
GL_APICALL void (* GL_APIENTRY glDeleteQueries) (GLsizei n, const GLuint* ids);
GL_APICALL GLboolean (* GL_APIENTRY glIsQuery) (GLuint id);
GL_APICALL void (* GL_APIENTRY glBeginQuery) (GLenum target, GLuint id);
GL_APICALL void (* GL_APIENTRY glEndQuery) (GLenum target);
GL_APICALL void (* GL_APIENTRY glGetQueryiv) (GLenum target, GLenum pname, GLint* params);
GL_APICALL void (* GL_APIENTRY glGetQueryObjectuiv) (GLuint id, GLenum pname, GLuint* params);
GL_APICALL GLboolean (* GL_APIENTRY glUnmapBuffer) (GLenum target);
GL_APICALL void (* GL_APIENTRY glGetBufferPointerv) (GLenum target, GLenum pname, GLvoid** params);
GL_APICALL void (* GL_APIENTRY glDrawBuffers) (GLsizei n, const GLenum* bufs);
GL_APICALL void (* GL_APIENTRY glUniformMatrix2x3fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
GL_APICALL void (* GL_APIENTRY glUniformMatrix3x2fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
GL_APICALL void (* GL_APIENTRY glUniformMatrix2x4fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
GL_APICALL void (* GL_APIENTRY glUniformMatrix4x2fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
GL_APICALL void (* GL_APIENTRY glUniformMatrix3x4fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
GL_APICALL void (* GL_APIENTRY glUniformMatrix4x3fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
GL_APICALL void (* GL_APIENTRY glBlitFramebuffer) (GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter);
GL_APICALL void (* GL_APIENTRY glRenderbufferStorageMultisample) (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
GL_APICALL void (* GL_APIENTRY glFramebufferTextureLayer) (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint layer);
GL_APICALL GLvoid* (* GL_APIENTRY glMapBufferRange) (GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
GL_APICALL void (* GL_APIENTRY glFlushMappedBufferRange) (GLenum target, GLintptr offset, GLsizeiptr length);
GL_APICALL void (* GL_APIENTRY glBindVertexArray) (GLuint array);
GL_APICALL void (* GL_APIENTRY glDeleteVertexArrays) (GLsizei n, const GLuint* arrays);
GL_APICALL void (* GL_APIENTRY glGenVertexArrays) (GLsizei n, GLuint* arrays);
GL_APICALL GLboolean (* GL_APIENTRY glIsVertexArray) (GLuint array);
GL_APICALL void (* GL_APIENTRY glGetIntegeri_v) (GLenum target, GLuint index, GLint* data);
GL_APICALL void (* GL_APIENTRY glBeginTransformFeedback) (GLenum primitiveMode);
GL_APICALL void (* GL_APIENTRY glEndTransformFeedback) (void);
GL_APICALL void (* GL_APIENTRY glBindBufferRange) (GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size);
GL_APICALL void (* GL_APIENTRY glBindBufferBase) (GLenum target, GLuint index, GLuint buffer);
GL_APICALL void (* GL_APIENTRY glTransformFeedbackVaryings) (GLuint program, GLsizei count, const GLchar* const* varyings, GLenum bufferMode);
GL_APICALL void (* GL_APIENTRY glGetTransformFeedbackVarying) (GLuint program, GLuint index, GLsizei bufSize, GLsizei* length, GLsizei* size, GLenum* type, GLchar* name);
GL_APICALL void (* GL_APIENTRY glVertexAttribIPointer) (GLuint index, GLint size, GLenum type, GLsizei stride, const GLvoid* pointer);
GL_APICALL void (* GL_APIENTRY glGetVertexAttribIiv) (GLuint index, GLenum pname, GLint* params);
GL_APICALL void (* GL_APIENTRY glGetVertexAttribIuiv) (GLuint index, GLenum pname, GLuint* params);
GL_APICALL void (* GL_APIENTRY glVertexAttribI4i) (GLuint index, GLint x, GLint y, GLint z, GLint w);
GL_APICALL void (* GL_APIENTRY glVertexAttribI4ui) (GLuint index, GLuint x, GLuint y, GLuint z, GLuint w);
GL_APICALL void (* GL_APIENTRY glVertexAttribI4iv) (GLuint index, const GLint* v);
GL_APICALL void (* GL_APIENTRY glVertexAttribI4uiv) (GLuint index, const GLuint* v);
GL_APICALL void (* GL_APIENTRY glGetUniformuiv) (GLuint program, GLint location, GLuint* params);
GL_APICALL GLint (* GL_APIENTRY glGetFragDataLocation) (GLuint program, const GLchar *name);
GL_APICALL void (* GL_APIENTRY glUniform1ui) (GLint location, GLuint v0);
GL_APICALL void (* GL_APIENTRY glUniform2ui) (GLint location, GLuint v0, GLuint v1);
GL_APICALL void (* GL_APIENTRY glUniform3ui) (GLint location, GLuint v0, GLuint v1, GLuint v2);
GL_APICALL void (* GL_APIENTRY glUniform4ui) (GLint location, GLuint v0, GLuint v1, GLuint v2, GLuint v3);
GL_APICALL void (* GL_APIENTRY glUniform1uiv) (GLint location, GLsizei count, const GLuint* value);
GL_APICALL void (* GL_APIENTRY glUniform2uiv) (GLint location, GLsizei count, const GLuint* value);
GL_APICALL void (* GL_APIENTRY glUniform3uiv) (GLint location, GLsizei count, const GLuint* value);
GL_APICALL void (* GL_APIENTRY glUniform4uiv) (GLint location, GLsizei count, const GLuint* value);
GL_APICALL void (* GL_APIENTRY glClearBufferiv) (GLenum buffer, GLint drawbuffer, const GLint* value);
GL_APICALL void (* GL_APIENTRY glClearBufferuiv) (GLenum buffer, GLint drawbuffer, const GLuint* value);
GL_APICALL void (* GL_APIENTRY glClearBufferfv) (GLenum buffer, GLint drawbuffer, const GLfloat* value);
GL_APICALL void (* GL_APIENTRY glClearBufferfi) (GLenum buffer, GLint drawbuffer, GLfloat depth, GLint stencil);
GL_APICALL const GLubyte* (* GL_APIENTRY glGetStringi) (GLenum name, GLuint index);
GL_APICALL void (* GL_APIENTRY glCopyBufferSubData) (GLenum readTarget, GLenum writeTarget, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size);
GL_APICALL void (* GL_APIENTRY glGetUniformIndices) (GLuint program, GLsizei uniformCount, const GLchar* const* uniformNames, GLuint* uniformIndices);
GL_APICALL void (* GL_APIENTRY glGetActiveUniformsiv) (GLuint program, GLsizei uniformCount, const GLuint* uniformIndices, GLenum pname, GLint* params);
GL_APICALL GLuint (* GL_APIENTRY glGetUniformBlockIndex) (GLuint program, const GLchar* uniformBlockName);
GL_APICALL void (* GL_APIENTRY glGetActiveUniformBlockiv) (GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint* params);
GL_APICALL void (* GL_APIENTRY glGetActiveUniformBlockName) (GLuint program, GLuint uniformBlockIndex, GLsizei bufSize, GLsizei* length, GLchar* uniformBlockName);
GL_APICALL void (* GL_APIENTRY glUniformBlockBinding) (GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding);
GL_APICALL void (* GL_APIENTRY glDrawArraysInstanced) (GLenum mode, GLint first, GLsizei count, GLsizei instanceCount);
GL_APICALL void (* GL_APIENTRY glDrawElementsInstanced) (GLenum mode, GLsizei count, GLenum type, const GLvoid* indices, GLsizei instanceCount);
GL_APICALL GLsync (* GL_APIENTRY glFenceSync) (GLenum condition, GLbitfield flags);
GL_APICALL GLboolean (* GL_APIENTRY glIsSync) (GLsync sync);
GL_APICALL void (* GL_APIENTRY glDeleteSync) (GLsync sync);
GL_APICALL GLenum (* GL_APIENTRY glClientWaitSync) (GLsync sync, GLbitfield flags, GLuint64 timeout);
GL_APICALL void (* GL_APIENTRY glWaitSync) (GLsync sync, GLbitfield flags, GLuint64 timeout);
GL_APICALL void (* GL_APIENTRY glGetInteger64v) (GLenum pname, GLint64* params);
GL_APICALL void (* GL_APIENTRY glGetSynciv) (GLsync sync, GLenum pname, GLsizei bufSize, GLsizei* length, GLint* values);
GL_APICALL void (* GL_APIENTRY glGetInteger64i_v) (GLenum target, GLuint index, GLint64* data);
GL_APICALL void (* GL_APIENTRY glGetBufferParameteri64v) (GLenum target, GLenum pname, GLint64* params);
GL_APICALL void (* GL_APIENTRY glGenSamplers) (GLsizei count, GLuint* samplers);
GL_APICALL void (* GL_APIENTRY glDeleteSamplers) (GLsizei count, const GLuint* samplers);
GL_APICALL GLboolean (* GL_APIENTRY glIsSampler) (GLuint sampler);
GL_APICALL void (* GL_APIENTRY glBindSampler) (GLuint unit, GLuint sampler);
GL_APICALL void (* GL_APIENTRY glSamplerParameteri) (GLuint sampler, GLenum pname, GLint param);
GL_APICALL void (* GL_APIENTRY glSamplerParameteriv) (GLuint sampler, GLenum pname, const GLint* param);
GL_APICALL void (* GL_APIENTRY glSamplerParameterf) (GLuint sampler, GLenum pname, GLfloat param);
GL_APICALL void (* GL_APIENTRY glSamplerParameterfv) (GLuint sampler, GLenum pname, const GLfloat* param);
GL_APICALL void (* GL_APIENTRY glGetSamplerParameteriv) (GLuint sampler, GLenum pname, GLint* params);
GL_APICALL void (* GL_APIENTRY glGetSamplerParameterfv) (GLuint sampler, GLenum pname, GLfloat* params);
GL_APICALL void (* GL_APIENTRY glVertexAttribDivisor) (GLuint index, GLuint divisor);
GL_APICALL void (* GL_APIENTRY glBindTransformFeedback) (GLenum target, GLuint id);
GL_APICALL void (* GL_APIENTRY glDeleteTransformFeedbacks) (GLsizei n, const GLuint* ids);
GL_APICALL void (* GL_APIENTRY glGenTransformFeedbacks) (GLsizei n, GLuint* ids);
GL_APICALL GLboolean (* GL_APIENTRY glIsTransformFeedback) (GLuint id);
GL_APICALL void (* GL_APIENTRY glPauseTransformFeedback) (void);
GL_APICALL void (* GL_APIENTRY glResumeTransformFeedback) (void);
GL_APICALL void (* GL_APIENTRY glGetProgramBinary) (GLuint program, GLsizei bufSize, GLsizei* length, GLenum* binaryFormat, GLvoid* binary);
GL_APICALL void (* GL_APIENTRY glProgramBinary) (GLuint program, GLenum binaryFormat, const GLvoid* binary, GLsizei length);
GL_APICALL void (* GL_APIENTRY glProgramParameteri) (GLuint program, GLenum pname, GLint value);
GL_APICALL void (* GL_APIENTRY glInvalidateFramebuffer) (GLenum target, GLsizei numAttachments, const GLenum* attachments);
GL_APICALL void (* GL_APIENTRY glInvalidateSubFramebuffer) (GLenum target, GLsizei numAttachments, const GLenum* attachments, GLint x, GLint y, GLsizei width, GLsizei height);
GL_APICALL void (* GL_APIENTRY glTexStorage2D) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
GL_APICALL void (* GL_APIENTRY glTexStorage3D) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
GL_APICALL void (* GL_APIENTRY glGetInternalformativ) (GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint* params);
// GL_EXT_texture_border_clamp
GL_APICALL void (* GL_APIENTRY glTexParameterIivEXT) (GLenum target, GLenum pname, const GLint *params);
GL_APICALL void (* GL_APIENTRY glTexParameterIuivEXT) (GLenum target, GLenum pname, const GLuint *params);
GL_APICALL void (* GL_APIENTRY glGetTexParameterIivEXT) (GLenum target, GLenum pname, GLint *params);
GL_APICALL void (* GL_APIENTRY glGetTexParameterIuivEXT) (GLenum target, GLenum pname, GLuint *params);
GL_APICALL void (* GL_APIENTRY glSamplerParameterIivEXT) (GLuint sampler, GLenum pname, const GLint *param);
GL_APICALL void (* GL_APIENTRY glSamplerParameterIuivEXT) (GLuint sampler, GLenum pname, const GLuint *param);
GL_APICALL void (* GL_APIENTRY glGetSamplerParameterIivEXT) (GLuint sampler, GLenum pname, GLint *params);
GL_APICALL void (* GL_APIENTRY glGetSamplerParameterIuivEXT) (GLuint sampler, GLenum pname, GLuint *params);
// End GL_EXT_texture_border_clamp
// clang-format on

View File

@ -1,488 +0,0 @@
#ifndef __gl3_h_
#define __gl3_h_
/*
* stub gl3.h for dynamic loading, based on:
* gl3.h last updated on $Date: 2013-02-12 14:37:24 -0800 (Tue, 12 Feb 2013) $
*
* Changes:
* - Added #include <GLES2/gl2.h>
* - Removed duplicate OpenGL ES 2.0 declarations
* - Converted OpenGL ES 3.0 function prototypes to function pointer
* declarations
* - Added gl3stubInit() declaration
*/
#include <GLES2/gl2.h>
#include <android/api-level.h>
// clang-format off
#ifdef __cplusplus
extern "C" {
#endif
/* Call this function before calling any OpenGL ES 3.0 functions. It will
* return GL_TRUE if the OpenGL ES 3.0 was successfully initialized, GL_FALSE
* otherwise. */
GLboolean gl3stubInit();
/*-------------------------------------------------------------------------
* Data type definitions
*-----------------------------------------------------------------------*/
/* OpenGL ES 3.0 */
typedef unsigned short GLhalf;
#if __ANDROID_API__ <= 19
typedef khronos_int64_t GLint64;
typedef khronos_uint64_t GLuint64;
typedef struct __GLsync *GLsync;
#endif
/*-------------------------------------------------------------------------
* Token definitions
*-----------------------------------------------------------------------*/
/* OpenGL ES core versions */
#define GL_ES_VERSION_3_0 1
/* OpenGL ES 3.0 */
#define GL_READ_BUFFER 0x0C02
#define GL_UNPACK_ROW_LENGTH 0x0CF2
#define GL_UNPACK_SKIP_ROWS 0x0CF3
#define GL_UNPACK_SKIP_PIXELS 0x0CF4
#define GL_PACK_ROW_LENGTH 0x0D02
#define GL_PACK_SKIP_ROWS 0x0D03
#define GL_PACK_SKIP_PIXELS 0x0D04
#define GL_COLOR 0x1800
#define GL_DEPTH 0x1801
#define GL_STENCIL 0x1802
#define GL_RED 0x1903
#define GL_RGB8 0x8051
#define GL_RGBA8 0x8058
#define GL_RGB10_A2 0x8059
#define GL_TEXTURE_BINDING_3D 0x806A
#define GL_UNPACK_SKIP_IMAGES 0x806D
#define GL_UNPACK_IMAGE_HEIGHT 0x806E
#define GL_TEXTURE_3D 0x806F
#define GL_TEXTURE_WRAP_R 0x8072
#define GL_MAX_3D_TEXTURE_SIZE 0x8073
#define GL_UNSIGNED_INT_2_10_10_10_REV 0x8368
#define GL_MAX_ELEMENTS_VERTICES 0x80E8
#define GL_MAX_ELEMENTS_INDICES 0x80E9
#define GL_TEXTURE_MIN_LOD 0x813A
#define GL_TEXTURE_MAX_LOD 0x813B
#define GL_TEXTURE_BASE_LEVEL 0x813C
#define GL_TEXTURE_MAX_LEVEL 0x813D
#define GL_MIN 0x8007
#define GL_MAX 0x8008
#define GL_DEPTH_COMPONENT24 0x81A6
#define GL_MAX_TEXTURE_LOD_BIAS 0x84FD
#define GL_TEXTURE_COMPARE_MODE 0x884C
#define GL_TEXTURE_COMPARE_FUNC 0x884D
#define GL_CURRENT_QUERY 0x8865
#define GL_QUERY_RESULT 0x8866
#define GL_QUERY_RESULT_AVAILABLE 0x8867
#define GL_BUFFER_MAPPED 0x88BC
#define GL_BUFFER_MAP_POINTER 0x88BD
#define GL_STREAM_READ 0x88E1
#define GL_STREAM_COPY 0x88E2
#define GL_STATIC_READ 0x88E5
#define GL_STATIC_COPY 0x88E6
#define GL_DYNAMIC_READ 0x88E9
#define GL_DYNAMIC_COPY 0x88EA
#define GL_MAX_DRAW_BUFFERS 0x8824
#define GL_DRAW_BUFFER0 0x8825
#define GL_DRAW_BUFFER1 0x8826
#define GL_DRAW_BUFFER2 0x8827
#define GL_DRAW_BUFFER3 0x8828
#define GL_DRAW_BUFFER4 0x8829
#define GL_DRAW_BUFFER5 0x882A
#define GL_DRAW_BUFFER6 0x882B
#define GL_DRAW_BUFFER7 0x882C
#define GL_DRAW_BUFFER8 0x882D
#define GL_DRAW_BUFFER9 0x882E
#define GL_DRAW_BUFFER10 0x882F
#define GL_DRAW_BUFFER11 0x8830
#define GL_DRAW_BUFFER12 0x8831
#define GL_DRAW_BUFFER13 0x8832
#define GL_DRAW_BUFFER14 0x8833
#define GL_DRAW_BUFFER15 0x8834
#define GL_MAX_FRAGMENT_UNIFORM_COMPONENTS 0x8B49
#define GL_MAX_VERTEX_UNIFORM_COMPONENTS 0x8B4A
#define GL_SAMPLER_3D 0x8B5F
#define GL_SAMPLER_2D_SHADOW 0x8B62
#define GL_FRAGMENT_SHADER_DERIVATIVE_HINT 0x8B8B
#define GL_PIXEL_PACK_BUFFER 0x88EB
#define GL_PIXEL_UNPACK_BUFFER 0x88EC
#define GL_PIXEL_PACK_BUFFER_BINDING 0x88ED
#define GL_PIXEL_UNPACK_BUFFER_BINDING 0x88EF
#define GL_FLOAT_MAT2x3 0x8B65
#define GL_FLOAT_MAT2x4 0x8B66
#define GL_FLOAT_MAT3x2 0x8B67
#define GL_FLOAT_MAT3x4 0x8B68
#define GL_FLOAT_MAT4x2 0x8B69
#define GL_FLOAT_MAT4x3 0x8B6A
#define GL_SRGB 0x8C40
#define GL_SRGB8 0x8C41
#define GL_SRGB8_ALPHA8 0x8C43
#define GL_COMPARE_REF_TO_TEXTURE 0x884E
#define GL_MAJOR_VERSION 0x821B
#define GL_MINOR_VERSION 0x821C
#define GL_NUM_EXTENSIONS 0x821D
#define GL_RGBA32F 0x8814
#define GL_RGB32F 0x8815
#define GL_RGBA16F 0x881A
#define GL_RGB16F 0x881B
#define GL_VERTEX_ATTRIB_ARRAY_INTEGER 0x88FD
#define GL_MAX_ARRAY_TEXTURE_LAYERS 0x88FF
#define GL_MIN_PROGRAM_TEXEL_OFFSET 0x8904
#define GL_MAX_PROGRAM_TEXEL_OFFSET 0x8905
#define GL_MAX_VARYING_COMPONENTS 0x8B4B
#define GL_TEXTURE_2D_ARRAY 0x8C1A
#define GL_TEXTURE_BINDING_2D_ARRAY 0x8C1D
#define GL_R11F_G11F_B10F 0x8C3A
#define GL_UNSIGNED_INT_10F_11F_11F_REV 0x8C3B
#define GL_RGB9_E5 0x8C3D
#define GL_UNSIGNED_INT_5_9_9_9_REV 0x8C3E
#define GL_TRANSFORM_FEEDBACK_VARYING_MAX_LENGTH 0x8C76
#define GL_TRANSFORM_FEEDBACK_BUFFER_MODE 0x8C7F
#define GL_MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS 0x8C80
#define GL_TRANSFORM_FEEDBACK_VARYINGS 0x8C83
#define GL_TRANSFORM_FEEDBACK_BUFFER_START 0x8C84
#define GL_TRANSFORM_FEEDBACK_BUFFER_SIZE 0x8C85
#define GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN 0x8C88
#define GL_RASTERIZER_DISCARD 0x8C89
#define GL_MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS 0x8C8A
#define GL_MAX_TRANSFORM_FEEDBACK_SEPARATE_ATTRIBS 0x8C8B
#define GL_INTERLEAVED_ATTRIBS 0x8C8C
#define GL_SEPARATE_ATTRIBS 0x8C8D
#define GL_TRANSFORM_FEEDBACK_BUFFER 0x8C8E
#define GL_TRANSFORM_FEEDBACK_BUFFER_BINDING 0x8C8F
#define GL_RGBA32UI 0x8D70
#define GL_RGB32UI 0x8D71
#define GL_RGBA16UI 0x8D76
#define GL_RGB16UI 0x8D77
#define GL_RGBA8UI 0x8D7C
#define GL_RGB8UI 0x8D7D
#define GL_RGBA32I 0x8D82
#define GL_RGB32I 0x8D83
#define GL_RGBA16I 0x8D88
#define GL_RGB16I 0x8D89
#define GL_RGBA8I 0x8D8E
#define GL_RGB8I 0x8D8F
#define GL_RED_INTEGER 0x8D94
#define GL_RGB_INTEGER 0x8D98
#define GL_RGBA_INTEGER 0x8D99
#define GL_SAMPLER_2D_ARRAY 0x8DC1
#define GL_SAMPLER_2D_ARRAY_SHADOW 0x8DC4
#define GL_SAMPLER_CUBE_SHADOW 0x8DC5
#define GL_UNSIGNED_INT_VEC2 0x8DC6
#define GL_UNSIGNED_INT_VEC3 0x8DC7
#define GL_UNSIGNED_INT_VEC4 0x8DC8
#define GL_INT_SAMPLER_2D 0x8DCA
#define GL_INT_SAMPLER_3D 0x8DCB
#define GL_INT_SAMPLER_CUBE 0x8DCC
#define GL_INT_SAMPLER_2D_ARRAY 0x8DCF
#define GL_UNSIGNED_INT_SAMPLER_2D 0x8DD2
#define GL_UNSIGNED_INT_SAMPLER_3D 0x8DD3
#define GL_UNSIGNED_INT_SAMPLER_CUBE 0x8DD4
#define GL_UNSIGNED_INT_SAMPLER_2D_ARRAY 0x8DD7
#define GL_BUFFER_ACCESS_FLAGS 0x911F
#define GL_BUFFER_MAP_LENGTH 0x9120
#define GL_BUFFER_MAP_OFFSET 0x9121
#define GL_DEPTH_COMPONENT32F 0x8CAC
#define GL_DEPTH32F_STENCIL8 0x8CAD
#define GL_FLOAT_32_UNSIGNED_INT_24_8_REV 0x8DAD
#define GL_FRAMEBUFFER_ATTACHMENT_COLOR_ENCODING 0x8210
#define GL_FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE 0x8211
#define GL_FRAMEBUFFER_ATTACHMENT_RED_SIZE 0x8212
#define GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE 0x8213
#define GL_FRAMEBUFFER_ATTACHMENT_BLUE_SIZE 0x8214
#define GL_FRAMEBUFFER_ATTACHMENT_ALPHA_SIZE 0x8215
#define GL_FRAMEBUFFER_ATTACHMENT_DEPTH_SIZE 0x8216
#define GL_FRAMEBUFFER_ATTACHMENT_STENCIL_SIZE 0x8217
#define GL_FRAMEBUFFER_DEFAULT 0x8218
#define GL_FRAMEBUFFER_UNDEFINED 0x8219
#define GL_DEPTH_STENCIL_ATTACHMENT 0x821A
#define GL_DEPTH_STENCIL 0x84F9
#define GL_UNSIGNED_INT_24_8 0x84FA
#define GL_DEPTH24_STENCIL8 0x88F0
#define GL_UNSIGNED_NORMALIZED 0x8C17
#define GL_DRAW_FRAMEBUFFER_BINDING GL_FRAMEBUFFER_BINDING
#define GL_READ_FRAMEBUFFER 0x8CA8
#define GL_DRAW_FRAMEBUFFER 0x8CA9
#define GL_READ_FRAMEBUFFER_BINDING 0x8CAA
#define GL_RENDERBUFFER_SAMPLES 0x8CAB
#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_LAYER 0x8CD4
#define GL_MAX_COLOR_ATTACHMENTS 0x8CDF
#define GL_COLOR_ATTACHMENT1 0x8CE1
#define GL_COLOR_ATTACHMENT2 0x8CE2
#define GL_COLOR_ATTACHMENT3 0x8CE3
#define GL_COLOR_ATTACHMENT4 0x8CE4
#define GL_COLOR_ATTACHMENT5 0x8CE5
#define GL_COLOR_ATTACHMENT6 0x8CE6
#define GL_COLOR_ATTACHMENT7 0x8CE7
#define GL_COLOR_ATTACHMENT8 0x8CE8
#define GL_COLOR_ATTACHMENT9 0x8CE9
#define GL_COLOR_ATTACHMENT10 0x8CEA
#define GL_COLOR_ATTACHMENT11 0x8CEB
#define GL_COLOR_ATTACHMENT12 0x8CEC
#define GL_COLOR_ATTACHMENT13 0x8CED
#define GL_COLOR_ATTACHMENT14 0x8CEE
#define GL_COLOR_ATTACHMENT15 0x8CEF
#define GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE 0x8D56
#define GL_MAX_SAMPLES 0x8D57
#define GL_HALF_FLOAT 0x140B
#define GL_MAP_READ_BIT 0x0001
#define GL_MAP_WRITE_BIT 0x0002
#define GL_MAP_INVALIDATE_RANGE_BIT 0x0004
#define GL_MAP_INVALIDATE_BUFFER_BIT 0x0008
#define GL_MAP_FLUSH_EXPLICIT_BIT 0x0010
#define GL_MAP_UNSYNCHRONIZED_BIT 0x0020
#define GL_RG 0x8227
#define GL_RG_INTEGER 0x8228
#define GL_R8 0x8229
#define GL_RG8 0x822B
#define GL_R16F 0x822D
#define GL_R32F 0x822E
#define GL_RG16F 0x822F
#define GL_RG32F 0x8230
#define GL_R8I 0x8231
#define GL_R8UI 0x8232
#define GL_R16I 0x8233
#define GL_R16UI 0x8234
#define GL_R32I 0x8235
#define GL_R32UI 0x8236
#define GL_RG8I 0x8237
#define GL_RG8UI 0x8238
#define GL_RG16I 0x8239
#define GL_RG16UI 0x823A
#define GL_RG32I 0x823B
#define GL_RG32UI 0x823C
#define GL_VERTEX_ARRAY_BINDING 0x85B5
#define GL_R8_SNORM 0x8F94
#define GL_RG8_SNORM 0x8F95
#define GL_RGB8_SNORM 0x8F96
#define GL_RGBA8_SNORM 0x8F97
#define GL_SIGNED_NORMALIZED 0x8F9C
#define GL_PRIMITIVE_RESTART_FIXED_INDEX 0x8D69
#define GL_COPY_READ_BUFFER 0x8F36
#define GL_COPY_WRITE_BUFFER 0x8F37
#define GL_COPY_READ_BUFFER_BINDING GL_COPY_READ_BUFFER
#define GL_COPY_WRITE_BUFFER_BINDING GL_COPY_WRITE_BUFFER
#define GL_UNIFORM_BUFFER 0x8A11
#define GL_UNIFORM_BUFFER_BINDING 0x8A28
#define GL_UNIFORM_BUFFER_START 0x8A29
#define GL_UNIFORM_BUFFER_SIZE 0x8A2A
#define GL_MAX_VERTEX_UNIFORM_BLOCKS 0x8A2B
#define GL_MAX_FRAGMENT_UNIFORM_BLOCKS 0x8A2D
#define GL_MAX_COMBINED_UNIFORM_BLOCKS 0x8A2E
#define GL_MAX_UNIFORM_BUFFER_BINDINGS 0x8A2F
#define GL_MAX_UNIFORM_BLOCK_SIZE 0x8A30
#define GL_MAX_COMBINED_VERTEX_UNIFORM_COMPONENTS 0x8A31
#define GL_MAX_COMBINED_FRAGMENT_UNIFORM_COMPONENTS 0x8A33
#define GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT 0x8A34
#define GL_ACTIVE_UNIFORM_BLOCK_MAX_NAME_LENGTH 0x8A35
#define GL_ACTIVE_UNIFORM_BLOCKS 0x8A36
#define GL_UNIFORM_TYPE 0x8A37
#define GL_UNIFORM_SIZE 0x8A38
#define GL_UNIFORM_NAME_LENGTH 0x8A39
#define GL_UNIFORM_BLOCK_INDEX 0x8A3A
#define GL_UNIFORM_OFFSET 0x8A3B
#define GL_UNIFORM_ARRAY_STRIDE 0x8A3C
#define GL_UNIFORM_MATRIX_STRIDE 0x8A3D
#define GL_UNIFORM_IS_ROW_MAJOR 0x8A3E
#define GL_UNIFORM_BLOCK_BINDING 0x8A3F
#define GL_UNIFORM_BLOCK_DATA_SIZE 0x8A40
#define GL_UNIFORM_BLOCK_NAME_LENGTH 0x8A41
#define GL_UNIFORM_BLOCK_ACTIVE_UNIFORMS 0x8A42
#define GL_UNIFORM_BLOCK_ACTIVE_UNIFORM_INDICES 0x8A43
#define GL_UNIFORM_BLOCK_REFERENCED_BY_VERTEX_SHADER 0x8A44
#define GL_UNIFORM_BLOCK_REFERENCED_BY_FRAGMENT_SHADER 0x8A46
#define GL_INVALID_INDEX 0xFFFFFFFFu
#define GL_MAX_VERTEX_OUTPUT_COMPONENTS 0x9122
#define GL_MAX_FRAGMENT_INPUT_COMPONENTS 0x9125
#define GL_MAX_SERVER_WAIT_TIMEOUT 0x9111
#define GL_OBJECT_TYPE 0x9112
#define GL_SYNC_CONDITION 0x9113
#define GL_SYNC_STATUS 0x9114
#define GL_SYNC_FLAGS 0x9115
#define GL_SYNC_FENCE 0x9116
#define GL_SYNC_GPU_COMMANDS_COMPLETE 0x9117
#define GL_UNSIGNALED 0x9118
#define GL_SIGNALED 0x9119
#define GL_ALREADY_SIGNALED 0x911A
#define GL_TIMEOUT_EXPIRED 0x911B
#define GL_CONDITION_SATISFIED 0x911C
#define GL_WAIT_FAILED 0x911D
#define GL_SYNC_FLUSH_COMMANDS_BIT 0x00000001
#define GL_TIMEOUT_IGNORED 0xFFFFFFFFFFFFFFFFull
#define GL_VERTEX_ATTRIB_ARRAY_DIVISOR 0x88FE
#define GL_ANY_SAMPLES_PASSED 0x8C2F
#define GL_ANY_SAMPLES_PASSED_CONSERVATIVE 0x8D6A
#define GL_SAMPLER_BINDING 0x8919
#define GL_RGB10_A2UI 0x906F
#define GL_TEXTURE_SWIZZLE_R 0x8E42
#define GL_TEXTURE_SWIZZLE_G 0x8E43
#define GL_TEXTURE_SWIZZLE_B 0x8E44
#define GL_TEXTURE_SWIZZLE_A 0x8E45
#define GL_GREEN 0x1904
#define GL_BLUE 0x1905
#define GL_INT_2_10_10_10_REV 0x8D9F
#define GL_TRANSFORM_FEEDBACK 0x8E22
#define GL_TRANSFORM_FEEDBACK_PAUSED 0x8E23
#define GL_TRANSFORM_FEEDBACK_ACTIVE 0x8E24
#define GL_TRANSFORM_FEEDBACK_BINDING 0x8E25
#define GL_PROGRAM_BINARY_RETRIEVABLE_HINT 0x8257
#define GL_PROGRAM_BINARY_LENGTH 0x8741
#define GL_NUM_PROGRAM_BINARY_FORMATS 0x87FE
#define GL_PROGRAM_BINARY_FORMATS 0x87FF
#define GL_COMPRESSED_R11_EAC 0x9270
#define GL_COMPRESSED_SIGNED_R11_EAC 0x9271
#define GL_COMPRESSED_RG11_EAC 0x9272
#define GL_COMPRESSED_SIGNED_RG11_EAC 0x9273
#define GL_COMPRESSED_RGB8_ETC2 0x9274
#define GL_COMPRESSED_SRGB8_ETC2 0x9275
#define GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2 0x9276
#define GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2 0x9277
#define GL_COMPRESSED_RGBA8_ETC2_EAC 0x9278
#define GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC 0x9279
#define GL_TEXTURE_IMMUTABLE_FORMAT 0x912F
#define GL_MAX_ELEMENT_INDEX 0x8D6B
#define GL_NUM_SAMPLE_COUNTS 0x9380
#define GL_TEXTURE_IMMUTABLE_LEVELS 0x82DF
/*-------------------------------------------------------------------------
* Entrypoint definitions
*-----------------------------------------------------------------------*/
/* OpenGL ES 3.0 */
extern GL_APICALL void (* GL_APIENTRY glReadBuffer) (GLenum mode);
extern GL_APICALL void (* GL_APIENTRY glDrawRangeElements) (GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const GLvoid* indices);
extern GL_APICALL void (* GL_APIENTRY glTexImage3D) (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const GLvoid* pixels);
extern GL_APICALL void (* GL_APIENTRY glTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const GLvoid* pixels);
extern GL_APICALL void (* GL_APIENTRY glCopyTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height);
extern GL_APICALL void (* GL_APIENTRY glCompressedTexImage3D) (GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const GLvoid* data);
extern GL_APICALL void (* GL_APIENTRY glCompressedTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const GLvoid* data);
extern GL_APICALL void (* GL_APIENTRY glGenQueries) (GLsizei n, GLuint* ids);
extern GL_APICALL void (* GL_APIENTRY glDeleteQueries) (GLsizei n, const GLuint* ids);
extern GL_APICALL GLboolean (* GL_APIENTRY glIsQuery) (GLuint id);
extern GL_APICALL void (* GL_APIENTRY glBeginQuery) (GLenum target, GLuint id);
extern GL_APICALL void (* GL_APIENTRY glEndQuery) (GLenum target);
extern GL_APICALL void (* GL_APIENTRY glGetQueryiv) (GLenum target, GLenum pname, GLint* params);
extern GL_APICALL void (* GL_APIENTRY glGetQueryObjectuiv) (GLuint id, GLenum pname, GLuint* params);
extern GL_APICALL GLboolean (* GL_APIENTRY glUnmapBuffer) (GLenum target);
extern GL_APICALL void (* GL_APIENTRY glGetBufferPointerv) (GLenum target, GLenum pname, GLvoid** params);
extern GL_APICALL void (* GL_APIENTRY glDrawBuffers) (GLsizei n, const GLenum* bufs);
extern GL_APICALL void (* GL_APIENTRY glUniformMatrix2x3fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
extern GL_APICALL void (* GL_APIENTRY glUniformMatrix3x2fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
extern GL_APICALL void (* GL_APIENTRY glUniformMatrix2x4fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
extern GL_APICALL void (* GL_APIENTRY glUniformMatrix4x2fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
extern GL_APICALL void (* GL_APIENTRY glUniformMatrix3x4fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
extern GL_APICALL void (* GL_APIENTRY glUniformMatrix4x3fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
extern GL_APICALL void (* GL_APIENTRY glBlitFramebuffer) (GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter);
extern GL_APICALL void (* GL_APIENTRY glRenderbufferStorageMultisample) (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
extern GL_APICALL void (* GL_APIENTRY glFramebufferTextureLayer) (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint layer);
extern GL_APICALL GLvoid* (* GL_APIENTRY glMapBufferRange) (GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
extern GL_APICALL void (* GL_APIENTRY glFlushMappedBufferRange) (GLenum target, GLintptr offset, GLsizeiptr length);
extern GL_APICALL void (* GL_APIENTRY glBindVertexArray) (GLuint array);
extern GL_APICALL void (* GL_APIENTRY glDeleteVertexArrays) (GLsizei n, const GLuint* arrays);
extern GL_APICALL void (* GL_APIENTRY glGenVertexArrays) (GLsizei n, GLuint* arrays);
extern GL_APICALL GLboolean (* GL_APIENTRY glIsVertexArray) (GLuint array);
extern GL_APICALL void (* GL_APIENTRY glGetIntegeri_v) (GLenum target, GLuint index, GLint* data);
extern GL_APICALL void (* GL_APIENTRY glBeginTransformFeedback) (GLenum primitiveMode);
extern GL_APICALL void (* GL_APIENTRY glEndTransformFeedback) (void);
extern GL_APICALL void (* GL_APIENTRY glBindBufferRange) (GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size);
extern GL_APICALL void (* GL_APIENTRY glBindBufferBase) (GLenum target, GLuint index, GLuint buffer);
extern GL_APICALL void (* GL_APIENTRY glTransformFeedbackVaryings) (GLuint program, GLsizei count, const GLchar* const* varyings, GLenum bufferMode);
extern GL_APICALL void (* GL_APIENTRY glGetTransformFeedbackVarying) (GLuint program, GLuint index, GLsizei bufSize, GLsizei* length, GLsizei* size, GLenum* type, GLchar* name);
extern GL_APICALL void (* GL_APIENTRY glVertexAttribIPointer) (GLuint index, GLint size, GLenum type, GLsizei stride, const GLvoid* pointer);
extern GL_APICALL void (* GL_APIENTRY glGetVertexAttribIiv) (GLuint index, GLenum pname, GLint* params);
extern GL_APICALL void (* GL_APIENTRY glGetVertexAttribIuiv) (GLuint index, GLenum pname, GLuint* params);
extern GL_APICALL void (* GL_APIENTRY glVertexAttribI4i) (GLuint index, GLint x, GLint y, GLint z, GLint w);
extern GL_APICALL void (* GL_APIENTRY glVertexAttribI4ui) (GLuint index, GLuint x, GLuint y, GLuint z, GLuint w);
extern GL_APICALL void (* GL_APIENTRY glVertexAttribI4iv) (GLuint index, const GLint* v);
extern GL_APICALL void (* GL_APIENTRY glVertexAttribI4uiv) (GLuint index, const GLuint* v);
extern GL_APICALL void (* GL_APIENTRY glGetUniformuiv) (GLuint program, GLint location, GLuint* params);
extern GL_APICALL GLint (* GL_APIENTRY glGetFragDataLocation) (GLuint program, const GLchar *name);
extern GL_APICALL void (* GL_APIENTRY glUniform1ui) (GLint location, GLuint v0);
extern GL_APICALL void (* GL_APIENTRY glUniform2ui) (GLint location, GLuint v0, GLuint v1);
extern GL_APICALL void (* GL_APIENTRY glUniform3ui) (GLint location, GLuint v0, GLuint v1, GLuint v2);
extern GL_APICALL void (* GL_APIENTRY glUniform4ui) (GLint location, GLuint v0, GLuint v1, GLuint v2, GLuint v3);
extern GL_APICALL void (* GL_APIENTRY glUniform1uiv) (GLint location, GLsizei count, const GLuint* value);
extern GL_APICALL void (* GL_APIENTRY glUniform2uiv) (GLint location, GLsizei count, const GLuint* value);
extern GL_APICALL void (* GL_APIENTRY glUniform3uiv) (GLint location, GLsizei count, const GLuint* value);
extern GL_APICALL void (* GL_APIENTRY glUniform4uiv) (GLint location, GLsizei count, const GLuint* value);
extern GL_APICALL void (* GL_APIENTRY glClearBufferiv) (GLenum buffer, GLint drawbuffer, const GLint* value);
extern GL_APICALL void (* GL_APIENTRY glClearBufferuiv) (GLenum buffer, GLint drawbuffer, const GLuint* value);
extern GL_APICALL void (* GL_APIENTRY glClearBufferfv) (GLenum buffer, GLint drawbuffer, const GLfloat* value);
extern GL_APICALL void (* GL_APIENTRY glClearBufferfi) (GLenum buffer, GLint drawbuffer, GLfloat depth, GLint stencil);
extern GL_APICALL const GLubyte* (* GL_APIENTRY glGetStringi) (GLenum name, GLuint index);
extern GL_APICALL void (* GL_APIENTRY glCopyBufferSubData) (GLenum readTarget, GLenum writeTarget, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size);
extern GL_APICALL void (* GL_APIENTRY glGetUniformIndices) (GLuint program, GLsizei uniformCount, const GLchar* const* uniformNames, GLuint* uniformIndices);
extern GL_APICALL void (* GL_APIENTRY glGetActiveUniformsiv) (GLuint program, GLsizei uniformCount, const GLuint* uniformIndices, GLenum pname, GLint* params);
extern GL_APICALL GLuint (* GL_APIENTRY glGetUniformBlockIndex) (GLuint program, const GLchar* uniformBlockName);
extern GL_APICALL void (* GL_APIENTRY glGetActiveUniformBlockiv) (GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint* params);
extern GL_APICALL void (* GL_APIENTRY glGetActiveUniformBlockName) (GLuint program, GLuint uniformBlockIndex, GLsizei bufSize, GLsizei* length, GLchar* uniformBlockName);
extern GL_APICALL void (* GL_APIENTRY glUniformBlockBinding) (GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding);
extern GL_APICALL void (* GL_APIENTRY glDrawArraysInstanced) (GLenum mode, GLint first, GLsizei count, GLsizei instanceCount);
extern GL_APICALL void (* GL_APIENTRY glDrawElementsInstanced) (GLenum mode, GLsizei count, GLenum type, const GLvoid* indices, GLsizei instanceCount);
extern GL_APICALL GLsync (* GL_APIENTRY glFenceSync) (GLenum condition, GLbitfield flags);
extern GL_APICALL GLboolean (* GL_APIENTRY glIsSync) (GLsync sync);
extern GL_APICALL void (* GL_APIENTRY glDeleteSync) (GLsync sync);
extern GL_APICALL GLenum (* GL_APIENTRY glClientWaitSync) (GLsync sync, GLbitfield flags, GLuint64 timeout);
extern GL_APICALL void (* GL_APIENTRY glWaitSync) (GLsync sync, GLbitfield flags, GLuint64 timeout);
extern GL_APICALL void (* GL_APIENTRY glGetInteger64v) (GLenum pname, GLint64* params);
extern GL_APICALL void (* GL_APIENTRY glGetSynciv) (GLsync sync, GLenum pname, GLsizei bufSize, GLsizei* length, GLint* values);
extern GL_APICALL void (* GL_APIENTRY glGetInteger64i_v) (GLenum target, GLuint index, GLint64* data);
extern GL_APICALL void (* GL_APIENTRY glGetBufferParameteri64v) (GLenum target, GLenum pname, GLint64* params);
extern GL_APICALL void (* GL_APIENTRY glGenSamplers) (GLsizei count, GLuint* samplers);
extern GL_APICALL void (* GL_APIENTRY glDeleteSamplers) (GLsizei count, const GLuint* samplers);
extern GL_APICALL GLboolean (* GL_APIENTRY glIsSampler) (GLuint sampler);
extern GL_APICALL void (* GL_APIENTRY glBindSampler) (GLuint unit, GLuint sampler);
extern GL_APICALL void (* GL_APIENTRY glSamplerParameteri) (GLuint sampler, GLenum pname, GLint param);
extern GL_APICALL void (* GL_APIENTRY glSamplerParameteriv) (GLuint sampler, GLenum pname, const GLint* param);
extern GL_APICALL void (* GL_APIENTRY glSamplerParameterf) (GLuint sampler, GLenum pname, GLfloat param);
extern GL_APICALL void (* GL_APIENTRY glSamplerParameterfv) (GLuint sampler, GLenum pname, const GLfloat* param);
extern GL_APICALL void (* GL_APIENTRY glGetSamplerParameteriv) (GLuint sampler, GLenum pname, GLint* params);
extern GL_APICALL void (* GL_APIENTRY glGetSamplerParameterfv) (GLuint sampler, GLenum pname, GLfloat* params);
extern GL_APICALL void (* GL_APIENTRY glVertexAttribDivisor) (GLuint index, GLuint divisor);
extern GL_APICALL void (* GL_APIENTRY glBindTransformFeedback) (GLenum target, GLuint id);
extern GL_APICALL void (* GL_APIENTRY glDeleteTransformFeedbacks) (GLsizei n, const GLuint* ids);
extern GL_APICALL void (* GL_APIENTRY glGenTransformFeedbacks) (GLsizei n, GLuint* ids);
extern GL_APICALL GLboolean (* GL_APIENTRY glIsTransformFeedback) (GLuint id);
extern GL_APICALL void (* GL_APIENTRY glPauseTransformFeedback) (void);
extern GL_APICALL void (* GL_APIENTRY glResumeTransformFeedback) (void);
extern GL_APICALL void (* GL_APIENTRY glGetProgramBinary) (GLuint program, GLsizei bufSize, GLsizei* length, GLenum* binaryFormat, GLvoid* binary);
extern GL_APICALL void (* GL_APIENTRY glProgramBinary) (GLuint program, GLenum binaryFormat, const GLvoid* binary, GLsizei length);
extern GL_APICALL void (* GL_APIENTRY glProgramParameteri) (GLuint program, GLenum pname, GLint value);
extern GL_APICALL void (* GL_APIENTRY glInvalidateFramebuffer) (GLenum target, GLsizei numAttachments, const GLenum* attachments);
extern GL_APICALL void (* GL_APIENTRY glInvalidateSubFramebuffer) (GLenum target, GLsizei numAttachments, const GLenum* attachments, GLint x, GLint y, GLsizei width, GLsizei height);
extern GL_APICALL void (* GL_APIENTRY glTexStorage2D) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
extern GL_APICALL void (* GL_APIENTRY glTexStorage3D) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
extern GL_APICALL void (* GL_APIENTRY glGetInternalformativ) (GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint* params);
#ifndef GL_EXT_texture_border_clamp
#define GL_EXT_texture_border_clamp 1
#define GL_TEXTURE_BORDER_COLOR_EXT 0x1004
#define GL_CLAMP_TO_BORDER_EXT 0x812D
extern GL_APICALL void (* GL_APIENTRY glTexParameterIivEXT) (GLenum target, GLenum pname, const GLint *params);
extern GL_APICALL void (* GL_APIENTRY glTexParameterIuivEXT) (GLenum target, GLenum pname, const GLuint *params);
extern GL_APICALL void (* GL_APIENTRY glGetTexParameterIivEXT) (GLenum target, GLenum pname, GLint *params);
extern GL_APICALL void (* GL_APIENTRY glGetTexParameterIuivEXT) (GLenum target, GLenum pname, GLuint *params);
extern GL_APICALL void (* GL_APIENTRY glSamplerParameterIivEXT) (GLuint sampler, GLenum pname, const GLint *param);
extern GL_APICALL void (* GL_APIENTRY glSamplerParameterIuivEXT) (GLuint sampler, GLenum pname, const GLuint *param);
extern GL_APICALL void (* GL_APIENTRY glGetSamplerParameterIivEXT) (GLuint sampler, GLenum pname, GLint *params);
extern GL_APICALL void (* GL_APIENTRY glGetSamplerParameterIuivEXT) (GLuint sampler, GLenum pname, GLuint *params);
#endif /* GL_EXT_texture_border_clamp */
#ifdef __cplusplus
}
#endif
// clang-format on
#endif

View File

@ -1,2 +0,0 @@
file(GLOB_RECURSE tmp *.cc)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)

View File

@ -1,249 +0,0 @@
#include "DataTransfer.h"
#include "GLLogging.h"
#include "caffe2/core/common.h"
inline uint16x4x4_t vld4_u16_aligned16(const uint16_t* address) {
return vld4_u16(static_cast<const uint16_t*>(__builtin_assume_aligned(address, 16)));
}
inline uint16x4_t vld1_u16_aligned8(const uint16_t* address) {
return vld1_u16(static_cast<const uint16_t*>(__builtin_assume_aligned(address, 8)));
}
inline void vst4_u16_aligned16(uint16_t* address, uint16x4x4_t data) {
vst4_u16(static_cast<uint16_t*>(__builtin_assume_aligned(address, 16)), data);
}
inline void vst1_u16_aligned8(uint16_t* address, uint16x4_t data) {
vst1_u16(static_cast<uint16_t*>(__builtin_assume_aligned(address, 8)), data);
}
template <int input_channels>
static void interleaveSlice(
void* output, const float* input, size_t width, size_t height, size_t row_stride) {
const float* input_r = input;
const float* input_g = input_r + height * width;
const float* input_b = input_g + height * width;
const float* input_a = input_b + height * width;
uint16_t* output_f16 = static_cast<uint16_t*>(output);
if (width >= 4) {
for (size_t y = 0; y < height; y++) {
size_t nx = width;
while (nx >= 4) {
const uint16x4_t r = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_r)));
input_r += 4;
uint16x4_t g, b, a;
g = b = a = vdup_n_u16(0);
if (input_channels >= 2) {
g = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_g)));
input_g += 4;
if (input_channels >= 3) {
b = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_b)));
input_b += 4;
if (input_channels >= 4) {
a = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_a)));
input_a += 4;
}
}
}
const uint16x4x4_t rgba = (uint16x4x4_t){{r, g, b, a}};
vst4_u16_aligned16(output_f16, rgba);
output_f16 += 4 * 4;
nx -= 4;
}
if (nx != 0) {
output_f16 -= (4 - nx) * 4;
input_r -= 4 - nx;
if (input_channels >= 2) {
input_g -= 4 - nx;
if (input_channels >= 3) {
input_b -= 4 - nx;
if (input_channels >= 4) {
input_a -= 4 - nx;
}
}
}
const uint16x4_t r = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_r)));
input_r += 4;
uint16x4_t g, b, a;
g = b = a = vdup_n_u16(0);
if (input_channels >= 2) {
g = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_g)));
input_g += 4;
if (input_channels >= 3) {
b = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_b)));
input_b += 4;
if (input_channels >= 4) {
a = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_a)));
input_a += 4;
}
}
}
const uint16x4x4_t rgba = (uint16x4x4_t){{r, g, b, a}};
vst4_u16_aligned16(output_f16, rgba);
output_f16 += 4 * 4;
}
output_f16 += (row_stride - width) * 4;
}
} else {
for (size_t y = 0; y < height; y++) {
for (size_t x = 0; x < width; x++) {
float32x4_t rgba = vld1q_dup_f32(input_r++);
if (input_channels >= 2) {
rgba = vld1q_lane_f32(input_g++, rgba, 1);
if (input_channels >= 3) {
rgba = vld1q_lane_f32(input_b++, rgba, 2);
if (input_channels >= 4) {
rgba = vld1q_lane_f32(input_a++, rgba, 3);
}
}
}
vst1_u16_aligned8(output_f16, uint16x4_t(vcvt_f16_f32(rgba)));
output_f16 += 4;
}
output_f16 += (row_stride - width) * 4;
}
}
}
void interleaveSlice(void* output,
const float* input,
size_t width,
size_t height,
size_t row_stride,
uint16_t input_channels) {
switch (input_channels) {
case 1:
interleaveSlice<1>(output, input, width, height, row_stride);
break;
case 2:
interleaveSlice<2>(output, input, width, height, row_stride);
break;
case 3:
interleaveSlice<3>(output, input, width, height, row_stride);
break;
case 4:
interleaveSlice<4>(output, input, width, height, row_stride);
break;
}
}
template <int output_channels>
static void deInterleaveSlice(
float* output, const void* input, size_t width, size_t height, size_t row_stride) {
float* output_r = output;
float* output_g = output_r + height * width;
float* output_b = output_g + height * width;
float* output_a = output_b + height * width;
const uint16_t* input_f16 = static_cast<const uint16_t*>(input);
if (width >= 4) {
for (size_t y = 0; y < height; y++) {
size_t nx = width;
while (nx >= 4) {
const uint16x4x4_t rgba = vld4_u16_aligned16(input_f16);
input_f16 += 4 * 4;
const float32x4_t r = vcvt_f32_f16(float16x4_t(rgba.val[0]));
vst1q_f32(output_r, r);
output_r += 4;
if (output_channels >= 2) {
const float32x4_t g = vcvt_f32_f16(float16x4_t(rgba.val[1]));
vst1q_f32(output_g, g);
output_g += 4;
if (output_channels >= 3) {
const float32x4_t b = vcvt_f32_f16(float16x4_t(rgba.val[2]));
vst1q_f32(output_b, b);
output_b += 4;
if (output_channels >= 4) {
const float32x4_t a = vcvt_f32_f16(float16x4_t(rgba.val[3]));
vst1q_f32(output_a, a);
output_a += 4;
}
}
}
nx -= 4;
}
if (nx != 0) {
input_f16 -= (4 - nx) * 4;
output_r -= 4 - nx;
if (output_channels >= 2) {
output_g -= 4 - nx;
if (output_channels >= 3) {
output_b -= 4 - nx;
if (output_channels >= 4) {
output_a -= 4 - nx;
}
}
}
const uint16x4x4_t rgba = vld4_u16_aligned16(input_f16);
input_f16 += 4 * 4;
const float32x4_t r = vcvt_f32_f16(float16x4_t(rgba.val[0]));
vst1q_f32(output_r, r);
output_r += 4;
if (output_channels >= 2) {
const float32x4_t g = vcvt_f32_f16(float16x4_t(rgba.val[1]));
vst1q_f32(output_g, g);
output_g += 4;
if (output_channels >= 3) {
const float32x4_t b = vcvt_f32_f16(float16x4_t(rgba.val[2]));
vst1q_f32(output_b, b);
output_b += 4;
if (output_channels >= 4) {
const float32x4_t a = vcvt_f32_f16(float16x4_t(rgba.val[3]));
vst1q_f32(output_a, a);
output_a += 4;
}
}
}
}
input_f16 += (row_stride - width) * 4;
}
} else {
for (size_t y = 0; y < height; y++) {
for (size_t x = 0; x < width; x++) {
const float32x4_t rgba = vcvt_f32_f16(float16x4_t(vld1_u16_aligned8(input_f16)));
input_f16 += 4;
vst1q_lane_f32(output_r++, rgba, 0);
if (output_channels >= 2) {
vst1q_lane_f32(output_g++, rgba, 1);
if (output_channels >= 3) {
vst1q_lane_f32(output_b++, rgba, 2);
if (output_channels >= 4) {
vst1q_lane_f32(output_a++, rgba, 3);
}
}
}
}
input_f16 += (row_stride - width) * 4;
}
}
}
void deInterleaveSlice(float* output,
const void* input,
size_t width,
size_t height,
size_t row_stride,
uint32_t output_channels) {
switch (output_channels) {
case 1:
deInterleaveSlice<1>(output, input, width, height, row_stride);
break;
case 2:
deInterleaveSlice<2>(output, input, width, height, row_stride);
break;
case 3:
deInterleaveSlice<3>(output, input, width, height, row_stride);
break;
case 4:
deInterleaveSlice<4>(output, input, width, height, row_stride);
break;
}
}

View File

@ -1,17 +0,0 @@
#pragma once
#include "arm_neon_support.h"
void interleaveSlice(void* output,
const float* input,
size_t width,
size_t height,
size_t row_stride,
uint16_t input_channels);
void deInterleaveSlice(float* output,
const void* input,
size_t width,
size_t height,
size_t input_stride,
uint32_t output_channels);

View File

@ -1,12 +0,0 @@
#pragma once
#include "caffe2/core/common.h"
#if CAFFE2_IOS
#include <OpenGLES/ES3/gl.h>
#include <OpenGLES/ES3/glext.h>
#elif CAFFE2_ANDROID
#include <EGL/egl.h>
#include <GLES2/gl2.h>
#include "caffe2/mobile/contrib/opengl/android/gl3stub.h"
#endif

View File

@ -1,126 +0,0 @@
#include "caffe2/core/logging.h"
#include "GL.h"
#include "GLContext.h"
#include "GLLogging.h"
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#if CAFFE2_IOS
#include "sys/utsname.h"
#include <regex>
#endif
void getOpenGLESVersion(int& major, int& minor) {
glGetIntegerv(GL_MAJOR_VERSION, &major);
glGetIntegerv(GL_MINOR_VERSION, &minor);
}
bool checkOpenGLExtensions(std::string gl_ext_str) {
static std::unordered_set<std::string> extensions;
if (extensions.empty()) {
const caffe2::string extension_str((const char*)glGetString(GL_EXTENSIONS));
LOG(INFO) << "GL_EXTENSIONS: " << extension_str;
std::stringstream ss(extension_str);
while (!ss.eof()) {
std::string extension;
ss >> extension;
extensions.insert(extension);
}
}
return extensions.count(gl_ext_str) > 0;
}
bool GLContext::GL_EXT_texture_border_clamp_defined() {
static int major = 0, minor = 0;
if (major == 0) {
getOpenGLESVersion(major, minor);
}
if (major == 3 && minor == 2) {
return true;
}
return checkOpenGLExtensions("GL_EXT_texture_border_clamp") || // Most common
checkOpenGLExtensions("GL_OES_texture_border_clamp");
}
bool supportOpenGLES3(bool* half_float_supported) {
int major = 0, minor = 0;
getOpenGLESVersion(major, minor);
LOG(INFO) << "GL_VERSION: OpenGL ES " << major << "." << minor;
if (major < 3) {
LOG(ERROR) << "OpenGL ES 3.0 not supported";
return false;
}
if (!checkOpenGLExtensions("GL_EXT_color_buffer_half_float")) {
LOG(ERROR) << "GL_EXT_color_buffer_half_float is not available";
if (half_float_supported) {
*half_float_supported = false;
}
}
return true;
}
#if CAFFE2_IOS
int iPhoneVersion() {
static int version = 0;
static std::once_flag once;
std::call_once(once, [&]() {
struct utsname systemInfo;
uname(&systemInfo);
std::string iphone_ver_str = systemInfo.machine;
LOG(INFO) << systemInfo.machine;
if (iphone_ver_str.find("iPhone") != std::string::npos) {
std::regex regStr("([0-9]+)");
std::smatch matchs;
if (std::regex_search(iphone_ver_str, matchs, regStr)) {
version = stoi(matchs[0]);
}
}
});
return version;
}
#endif
#if CAFFE2_ANDROID
// whitelist of supported GPUs
bool isSupportedRenderer() {
static std::unordered_set<std::string> supported_renderers = {
"Adreno (TM) 540",
"Adreno (TM) 530",
"Adreno (TM) 510",
"Adreno (TM) 430",
"Adreno (TM) 418",
"Mali-G71",
"Mali-T880",
"NVIDIA Tegra"};
std::string rendererStr((const char*)glGetString(GL_RENDERER));
LOG(INFO) << "GL_RENDERER: " << rendererStr;
int start = rendererStr.find_first_not_of(" ");
int end = rendererStr.find_last_not_of(" ");
rendererStr = rendererStr.substr(start, end - start + 1);
return supported_renderers.count(rendererStr) > 0;
}
#endif
bool isSupportedDevice() {
#if CAFFE2_IOS
return iPhoneVersion() >= 7; // iPhone 6 and up
#elif CAFFE2_ANDROID
return isSupportedRenderer();
#else
return false;
#endif
}

View File

@ -1,46 +0,0 @@
#pragma once
#include "GLTexture.h"
#include "caffe2/core/common.h"
#include <functional>
class GLContext {
private:
static std::unique_ptr<GLContext> _glcontext;
std::function<const GLTexture*(const int width, const int height)> foreignTextureAllocator =
nullptr;
protected:
bool half_float_supported = true;
public:
virtual void set_context() = 0;
virtual void reset_context() = 0;
virtual void flush_context() = 0;
virtual ~GLContext(){};
static void initGLContext();
static GLContext* getGLContext();
static void deleteGLContext();
static bool GL_EXT_texture_border_clamp_defined();
inline bool halfFloatTextureSupported() { return half_float_supported; }
void setTextureAllocator(
std::function<const GLTexture*(const int width, const int height)> textureAllocator) {
foreignTextureAllocator = textureAllocator;
}
std::function<const GLTexture*(const int width, const int height)> getTextureAllocator() {
return foreignTextureAllocator;
}
};
bool supportOpenGLES3(bool* hfs = nullptr);
bool isSupportedDevice();
#if CAFFE2_IOS
int iPhoneVersion();
#endif

View File

@ -1,567 +0,0 @@
#include "GLFilter.h"
#include <sstream>
GLFilter::GLFilter(const std::string _kernel_name,
const std::string _vertex_shader,
const std::string _fragment_shader,
const std::vector<binding*> uniforms,
const std::vector<binding*> uniform_blocks,
const std::vector<binding*> attributes,
const replacements_t& _replacements)
: kernel_name(_kernel_name),
uniforms_(uniforms),
uniform_blocks_(uniform_blocks),
attributes_(attributes) {
// shader program
if (createProgram(_vertex_shader.c_str(),
process_replacements(_fragment_shader, _replacements).c_str(),
&program)) {
gl_log(GL_VERBOSE, "created program %d\n", program);
} else {
releaseBuffers();
throwRuntimeError(
[&](std::stringstream& errmsg) { errmsg << "Problem initializing OpenGL program"; });
}
}
const char* shader_utils = R"GLSL(
#define unpackHalf4x16(pd) vec4(unpackHalf2x16(pd.x), unpackHalf2x16(pd.y))
#define packHalf4x16(pd) uvec2(packHalf2x16(pd.xy), packHalf2x16(pd.zw))
)GLSL";
const char* half_float_texture_utils = R"GLSL(
precision mediump sampler2D;
#define TEXTURE_OUTPUT(_loc, _var) \
layout(location = _loc) out mediump vec4 _var
#define TEXTURE_INPUT(_var) \
uniform sampler2D _var
#define TEXTURE_LOAD(_input, _coord) \
texelFetch((_input), (_coord), 0)
#define TEXTURE_STORE(_val) \
(_val)
)GLSL";
const char* half_float_compat_texture_utils = R"GLSL(
precision highp usampler2D;
#define TEXTURE_OUTPUT(_loc, _var) \
layout(location = _loc) out highp uvec2 _var
#define TEXTURE_INPUT(_var) \
uniform usampler2D _var
#define TEXTURE_LOAD(_input, _coord) \
unpackHalf4x16(texelFetch((_input), (_coord), 0).xy)
#define TEXTURE_STORE(_val) \
(uvec2(packHalf4x16((_val))))
)GLSL";
std::string GLFilter::process_replacements(std::string shader,
const replacements_t& replacements) const {
for (auto&& replacement : replacements) {
std::string tag = "$(" + replacement.first + ")";
std::string value = replacement.second;
size_t position = shader.find(tag);
if (position != std::string::npos) {
shader.replace(position, tag.size(), value);
} else {
throwRuntimeError(
[&](std::stringstream& errmsg) { errmsg << "Couldn't find replacement tag: " << tag; });
}
}
// Add some #defines for convenience
std::string version_tag = "#version 300 es";
if (GLContext::getGLContext()->halfFloatTextureSupported()) {
shader.insert(shader.find(version_tag) + version_tag.size(), half_float_texture_utils);
} else {
shader.insert(shader.find(version_tag) + version_tag.size(), half_float_compat_texture_utils);
}
shader.insert(shader.find(version_tag) + version_tag.size(), shader_utils);
return shader;
}
template <typename T>
void GLFilter::attach_uniform_buffer(const binding* block,
GLuint bindingPoint,
std::function<void(T*, size_t)> loader) {
if (block->location >= 0) {
if (bindingPoint < kMaxUniformBlocks) {
if (uniformBlock[bindingPoint] == 0) {
// Associate the uniform block index with a binding point
glUniformBlockBinding(program, block->location, bindingPoint);
// Get the size of block
glGetActiveUniformBlockiv(program, block->location, GL_UNIFORM_BLOCK_DATA_SIZE, &blockSize[bindingPoint]);
// Create and fill a buffer object
glGenBuffers(1, &uniformBlock[bindingPoint]);
gl_log(GL_VERBOSE, "created uniform buffer block %d\n", uniformBlock[bindingPoint]);
}
// Fill a buffer object
glBindBuffer(GL_UNIFORM_BUFFER, uniformBlock[bindingPoint]);
glBufferData(GL_UNIFORM_BUFFER, blockSize[bindingPoint], NULL, GL_DYNAMIC_DRAW);
checkGLError([&](std::stringstream& errmsg) {
errmsg << "Unable to bind uniform buffer " << block->name << ":" << block->location
<< " at binding point " << bindingPoint;
});
T* blockData = (T*)glMapBufferRange(
GL_UNIFORM_BUFFER, 0, blockSize[bindingPoint], GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT);
if (blockData != NULL) {
// Copy the data into the mapped buffer
if (loader)
loader(blockData, blockSize[bindingPoint]);
// Unmap the buffer
if (glUnmapBuffer(GL_UNIFORM_BUFFER) == GL_TRUE) {
// Bind the buffer object to the uniform block binding point
glBindBufferBase(GL_UNIFORM_BUFFER, bindingPoint, uniformBlock[bindingPoint]);
} else {
throwRuntimeError([&](std::stringstream& errmsg) { errmsg << "Error unmapping element buffer object"; });
}
} else {
throwRuntimeError([&](std::stringstream& errmsg) {
errmsg << "Error mapping element buffer object, blockSize: " << blockSize;
});
}
glBindBuffer(GL_UNIFORM_BUFFER, 0);
} else {
throwRuntimeError([&](std::stringstream& errmsg) {
errmsg << "Uniform block binding point out of range: " << bindingPoint << ", should be < "
<< kMaxUniformBlocks;
});
}
} else {
throwRuntimeError([&](std::stringstream& errmsg) { errmsg << "unbound uniform block"; });
}
}
template void GLFilter::attach_uniform_buffer<float16_t>(const binding* block,
GLuint bindingPoint,
std::function<void(float16_t*, size_t)> loader);
static const GLenum unused_capability[] = {GL_CULL_FACE,
GL_BLEND,
GL_DITHER,
GL_STENCIL_TEST,
GL_DEPTH_TEST,
GL_SCISSOR_TEST,
GL_POLYGON_OFFSET_FILL,
GL_SAMPLE_ALPHA_TO_COVERAGE,
GL_SAMPLE_COVERAGE};
void GLFilter::run(const std::vector<texture_attachment>& input,
const std::vector<const GLTexture*>& output,
std::function<void(void)> uniforms_initializer,
int width,
int height) {
const int first_texture_id = GL_TEXTURE0;
GLint defaultFramebuffer = 0;
glGetIntegerv(GL_FRAMEBUFFER_BINDING, &defaultFramebuffer);
gl_log(GL_VERBOSE,
"GLFilter::run %s - inputs: %d, outputs: %d, width: %d, height: %d\n",
kernel_name.c_str(),
input.size(),
output.size(),
width,
height);
if (output.size() > 4) {
throwRuntimeError([&](std::stringstream& errmsg) {
errmsg << "Too many output textures: " << output.size() << ", should be <= 4";
});
}
if (frameBuffer == 0) {
// create the frame buffer
glGenFramebuffers(1, &frameBuffer);
gl_log(GL_VERBOSE, "created frame buffer %d\n", frameBuffer);
}
glBindFramebuffer(GL_FRAMEBUFFER, frameBuffer);
checkGLError([&](std::stringstream& errmsg) { errmsg << "glBindFramebuffer"; });
// Set up the output textures
for (int i = 0; i < output.size(); i++) {
GLenum target = output[i]->target();
GLuint texture = output[i]->name();
glBindTexture(target, texture);
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, target, texture, 0);
checkGLError([&](std::stringstream& errmsg) {
errmsg << "Unable to connect output texture " << texture << " at color attachment " << i;
});
gl_log(GL_VERBOSE, "connected output texture %d to color attachment %d\n", texture, i);
}
// Bind the output textures to the frame buffer attachments
if (!frame_buffer_initialized) {
const int attachments_number = output.size();
const GLenum attachments[4] = {
GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1, GL_COLOR_ATTACHMENT2, GL_COLOR_ATTACHMENT3};
glDrawBuffers(attachments_number, attachments);
int fbs = glCheckFramebufferStatus(GL_FRAMEBUFFER);
if (fbs != GL_FRAMEBUFFER_COMPLETE) {
throwRuntimeError(
[&](std::stringstream& errmsg) { errmsg << "Frame buffer incomplete: " << fbs; });
}
frame_buffer_initialized = true;
}
glUseProgram(program);
checkGLError([&](std::stringstream& errmsg) { errmsg << "glUseProgram"; });
// Set up the input textures
GLenum texture_idx = first_texture_id;
for (int i = 0; i < input.size(); i++, texture_idx++) {
if (input[i].uniform->location >= 0) {
GLenum target = input[i].texture->target();
GLuint texture = input[i].texture->name();
glActiveTexture(texture_idx);
glBindTexture(target, texture);
glUniform1i(input[i].uniform->location, texture_idx - GL_TEXTURE0);
checkGLError([&](std::stringstream& errmsg) {
errmsg << ": Unable to attach input texture " << texture << " to uniform "
<< input[i].uniform->name << ":" << input[i].uniform->location << " at index "
<< texture_idx - GL_TEXTURE0;
});
gl_log(GL_VERBOSE,
"connected input texture %d to texture unit %d\n",
texture,
texture_idx - GL_TEXTURE0);
} else {
gl_log(GL_VERBOSE, "something wrong happened when i = %d\n", i);
}
}
// Caller supplied uniforms initializer
if (uniforms_initializer) {
uniforms_initializer();
checkGLError([&](std::stringstream& errmsg) {
errmsg << "errors in the uniforms initializer callback";
});
}
// Validate program
if (check_opengl_errors && !validateProgram(program)) {
throwRuntimeError(
[&](std::stringstream& errmsg) { errmsg << "Couldn't validate OpenGL program"; });
}
glViewport(0, 0, width, height);
// Disable stuff we don't need and make sure that we have all the channels ebabled
for (int i = 0; i < sizeof(unused_capability) / sizeof(GLenum); i++) {
glDisable(unused_capability[i]);
}
glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
// glDrawElements should be more efficient, but on iOS glDrawArrays is faster.
const bool useDrawArrays = true;
if (useDrawArrays) {
enum { ATTRIB_VERTEX, ATTRIB_TEXTUREPOSITON, NUM_ATTRIBUTES };
static const GLfloat squareVertices[] = {
-1.0f,
-1.0f, // bottom left
1.0f,
-1.0f, // bottom right
-1.0f,
1.0f, // top left
1.0f,
1.0f, // top right
};
static const float textureVertices[] = {
0.0f,
0.0f, // bottom left
1.0f,
0.0f, // bottom right
0.0f,
1.0f, // top left
1.0f,
1.0f, // top right
};
glBindBuffer(GL_ARRAY_BUFFER, 0);
glVertexAttribPointer(ATTRIB_VERTEX, 2, GL_FLOAT, 0, 0, squareVertices);
glEnableVertexAttribArray(ATTRIB_VERTEX);
checkGLError(
[&](std::stringstream& errmsg) { errmsg << "glEnableVertexAttribArray(ATTRIB_VERTEX)"; });
glVertexAttribPointer(ATTRIB_TEXTUREPOSITON, 2, GL_FLOAT, 0, 0, textureVertices);
glEnableVertexAttribArray(ATTRIB_TEXTUREPOSITON);
checkGLError([&](std::stringstream& errmsg) {
errmsg << "glEnableVertexAttribArray(ATTRIB_TEXTUREPOSITON)";
});
gl_log(GL_VERBOSE, "Calling glDrawArrays\n");
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
checkGLError([&](std::stringstream& errmsg) { errmsg << "glDrawArrays"; });
} else {
// Run the shaders on the output geometry
static const GLfloat vVertices[] = {
-1.0f, -1.0f, 0.0f, // Position 0
0.0f, 0.0f, // TexCoord 0
-1.0f, 1.0f, 0.0f, // Position 1
0.0f, 1.0f, // TexCoord 1
1.0f, 1.0f, 0.0f, // Position 2
1.0f, 1.0f, // TexCoord 2
1.0f, -1.0f, 0.0f, // Position 3
1.0f, 0.0f // TexCoord 3
};
static const GLushort indices[] = {0, 1, 2, 0, 2, 3};
// Load the vertex position
glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 5 * sizeof(GLfloat), vVertices);
// Load the texture coordinate
glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 5 * sizeof(GLfloat), &vVertices[3]);
glEnableVertexAttribArray(0);
glEnableVertexAttribArray(1);
gl_log(GL_VERBOSE, "Calling glDrawElements\n");
glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_SHORT, indices);
checkGLError([&](std::stringstream& errmsg) { errmsg << "glDrawElements"; });
}
#if CAFFE2_ANDROID
glFlush();
#endif
// Unbind the current texture - Man, this is expensive!
for (int i = texture_idx - 1; i >= first_texture_id; i--) {
gl_log(GL_VERBOSE, "unbinding texture unit %d\n", i - GL_TEXTURE0);
glActiveTexture(i);
glBindTexture(GL_TEXTURE_2D, 0);
checkGLError([&](std::stringstream& errmsg) {
errmsg << "Error unbinding texture unit " << i - GL_TEXTURE0;
});
}
glBindFramebuffer(GL_FRAMEBUFFER, defaultFramebuffer);
}
void GLFilter::releaseBuffers() {
for (int i = 0; i < kMaxUniformBlocks; i++) {
if (uniformBlock[i]) {
gl_log(GL_VERBOSE, "deleting uniform buffer block %d\n", uniformBlock[i]);
glDeleteBuffers(1, &uniformBlock[i]);
uniformBlock[i] = 0;
}
}
if (frameBuffer) {
gl_log(GL_VERBOSE, "deleting frame buffer %d\n", frameBuffer);
glDeleteFramebuffers(1, &frameBuffer);
frameBuffer = 0;
}
}
void GLFilter::deleteProgram() {
if (program) {
gl_log(GL_VERBOSE, "deleting program %d\n", program);
glDeleteProgram(program);
program = 0;
}
}
void GLFilter::deleteBindings() {
for (binding* uniform : uniforms_) {
delete uniform;
}
for (binding* uniform_block : uniform_blocks_) {
delete uniform_block;
}
for (binding* attribute : attributes_) {
delete attribute;
}
}
// Simple vertex shader setting up the coordinates system
const char* GLFilter::vertex_shader = R"GLSL(#version 300 es
layout(location = 0) in vec4 a_position;
layout(location = 1) in vec2 a_texCoord;
out vec2 v_texCoord;
void main()
{
gl_Position = a_position;
v_texCoord = a_texCoord;
}
)GLSL";
bool GLFilter::createProgram(const GLchar* vertSource,
const GLchar* fragSource,
GLuint* program) const {
GLuint vertShader = 0, fragShader = 0, prog = 0, status = 1;
// Clear the error state. We check error state later in the function and
// want to capture only errors in filter program initialization.
glGetError();
// Create shader program
prog = glCreateProgram();
// Create and compile vertex shader
status *= compileShader(GL_VERTEX_SHADER, 1, &vertSource, &vertShader);
// Create and compile fragment shader
status *= compileShader(GL_FRAGMENT_SHADER, 1, &fragSource, &fragShader);
// Attach vertex shader to program
glAttachShader(prog, vertShader);
// Attach fragment shader to program
glAttachShader(prog, fragShader);
// Bind attribute locations
// This needs to be done prior to linking
for (auto&& attribute : attributes_) {
glBindAttribLocation(prog, attribute->location, attribute->name.c_str());
checkGLError([&](std::stringstream& errmsg) {
errmsg << "Couldn't bind attribute: " << attribute->name << " at location "
<< attribute->location;
});
}
// Link program
status *= linkProgram(prog);
// Get locations of uniforms
if (status) {
for (auto&& uniform : uniforms_) {
uniform->location = glGetUniformLocation(prog, uniform->name.c_str());
checkGLError([&](std::stringstream& errmsg) {
errmsg << "Couldn't resolve uniform: " << uniform->name;
});
}
for (auto&& uniform_block : uniform_blocks_) {
uniform_block->location = glGetUniformBlockIndex(prog, uniform_block->name.c_str());
gl_log(GL_VERBOSE,
"Getting location for uniform block: %s, location: %d\n",
uniform_block->name.c_str(),
uniform_block->location);
checkGLError([&](std::stringstream& errmsg) {
errmsg << "Couldn't resolve uniform block: " << uniform_block->name;
});
}
*program = prog;
}
// Release vertex and fragment shaders
if (vertShader) {
glDetachShader(prog, vertShader);
glDeleteShader(vertShader);
}
if (fragShader) {
glDetachShader(prog, fragShader);
glDeleteShader(fragShader);
}
return status == 1;
}
#include <stdlib.h>
/* Compile a shader from the provided source(s) */
GLint GLFilter::compileShader(GLenum target,
GLsizei count,
const GLchar** sources,
GLuint* shader) const {
GLint status = 1;
*shader = glCreateShader(target);
glShaderSource(*shader, count, sources, NULL);
glCompileShader(*shader);
GLint logLength = 0;
glGetShaderiv(*shader, GL_INFO_LOG_LENGTH, &logLength);
if (logLength > 0) {
std::vector<GLchar> log(logLength);
glGetShaderInfoLog(*shader, logLength, &logLength, &log[0]);
gl_log(GL_ERR, "Shader compile log:\n%s", &log[0]);
}
glGetShaderiv(*shader, GL_COMPILE_STATUS, &status);
if (status == 0) {
int i;
gl_log(GL_ERR, "Failed to compile shader:\n");
for (i = 0; i < count; i++)
gl_log(GL_ERR, "%s", sources[i]);
}
return status;
}
/* Link a program with all currently attached shaders */
GLint GLFilter::linkProgram(GLuint program) const {
GLint status = 1;
glLinkProgram(program);
GLint logLength = 0;
glGetProgramiv(program, GL_INFO_LOG_LENGTH, &logLength);
if (logLength > 0) {
std::vector<GLchar> log(logLength);
glGetProgramInfoLog(program, logLength, &logLength, &log[0]);
gl_log(GL_ERR, "Program link log:\n%s", &log[0]);
}
glGetProgramiv(program, GL_LINK_STATUS, &status);
if (status == 0)
gl_log(GL_ERR, "Failed to link program %d\n", program);
return status;
}
/* Validate a program (for i.e. inconsistent samplers) */
GLint GLFilter::validateProgram(GLuint program) const {
GLint status = 1;
glValidateProgram(program);
GLint logLength = 0;
glGetProgramiv(program, GL_INFO_LOG_LENGTH, &logLength);
if (logLength > 0) {
std::vector<GLchar> log(logLength);
glGetProgramInfoLog(program, logLength, &logLength, &log[0]);
gl_log(GL_ERR, "Program validate log:\n%s", &log[0]);
}
glGetProgramiv(program, GL_VALIDATE_STATUS, &status);
if (status == 0)
gl_log(GL_ERR, "Failed to validate program %d\n", program);
return status;
}

View File

@ -1,104 +0,0 @@
#pragma once
#include "GLContext.h"
#include "GLTexture.h"
#include "arm_neon_support.h"
#include <functional>
#include <string>
#include <vector>
#define BINDING(variableName) (variableName = new binding{#variableName})
#define ATTRIBUTE(variableName, value) (variableName = new binding{#variableName, value})
class GLFilter {
protected:
const std::string kernel_name;
GLuint program = 0;
GLuint frameBuffer = 0;
static constexpr int kMaxUniformBlocks = 12;
GLuint uniformBlock[kMaxUniformBlocks] = {0};
GLint blockSize[kMaxUniformBlocks] = {0};
bool frame_buffer_initialized = false;
// glGetError() can be expensive, we should turn error checking off when we're done with debugging
static constexpr bool check_opengl_errors = true;
public:
typedef std::vector<std::pair<std::string, std::string>> replacements_t;
struct binding {
const std::string name;
GLint location;
};
struct texture_attachment {
const GLTexture* texture;
const binding* uniform;
};
GLFilter(const std::string kernel_name,
const std::string vertex_shader,
const std::string fragment_shader,
const std::vector<binding*> uniforms,
const std::vector<binding*> uniform_blocks = {},
const std::vector<binding*> attributes = {},
const replacements_t& replacements = {});
// TODO: The set and reset context need to be commented out for unit testing
~GLFilter() {
releaseBuffers();
deleteProgram();
deleteBindings();
}
void throwRuntimeError(std::function<void(std::stringstream& errmsg)> error_formatter) const {
std::stringstream errmsg;
errmsg << kernel_name << ": ";
error_formatter(errmsg);
throw std::runtime_error(errmsg.str());
}
void checkGLError(std::function<void(std::stringstream& errmsg)> error_formatter) const {
if (check_opengl_errors) {
GLenum glError = glGetError();
if (glError != GL_NO_ERROR) {
throwRuntimeError([&](std::stringstream& errmsg) {
error_formatter(errmsg);
errmsg << ", " << glError;
});
}
}
}
template <typename T>
void attach_uniform_buffer(const binding* block,
GLuint bindingPoint, std::function<void(T*, size_t)> loader);
void run(const std::vector<texture_attachment>& input,
const std::vector<const GLTexture*>& output,
std::function<void(void)> uniforms_initializer,
int width,
int height);
void releaseBuffers();
void deleteProgram();
void deleteBindings();
static const char* vertex_shader;
private:
const std::vector<binding*> uniforms_;
const std::vector<binding*> uniform_blocks_;
const std::vector<binding*> attributes_;
std::string process_replacements(std::string source, const replacements_t& replacements) const;
bool createProgram(const GLchar* vertSource, const GLchar* fragSource, GLuint* program) const;
GLint compileShader(GLenum target, GLsizei count, const GLchar** sources, GLuint* shader) const;
GLint linkProgram(GLuint program) const;
GLint validateProgram(GLuint program) const;
};

View File

@ -1,15 +0,0 @@
#include "GLImage.h"
#include "arm_neon_support.h"
#include <c10/util/typeid.h>
namespace caffe2 {
CAFFE_KNOWN_TYPE(GLImage<float>);
CAFFE_KNOWN_TYPE(GLImage<uint8_t>);
CAFFE_KNOWN_TYPE(GLImageVector<float>);
CAFFE_KNOWN_TYPE(GLImageVector<uint8_t>);
#ifdef __ARM_NEON__
CAFFE_KNOWN_TYPE(GLImage<float16_t>);
CAFFE_KNOWN_TYPE(GLImageVector<float16_t>);
#endif
} // namespace caffe2

View File

@ -1,151 +0,0 @@
#pragma once
#include "GLTexture.h"
#include "caffe2/core/logging.h"
#include <functional>
#include <vector>
template <typename T>
class GLImage {
public:
const int width;
const int height;
const int channels;
const int data_size;
const int tile_x;
const int tile_y;
const int texture_width;
const int texture_height;
const int slices;
const std::vector<const GLTexture*> textures;
constexpr static int slice_channels = 4;
static constexpr int channels_to_slices(int channels, int tile_x, int tile_y) {
return ((channels + slice_channels - 1) / slice_channels + tile_x * tile_y - 1) /
(tile_x * tile_y);
}
static const std::vector<const GLTexture*> allocate_textures(
int slices, std::function<const GLTexture*(int slice)> texture_loader) {
std::vector<const GLTexture*> textures;
for (int i = 0; i < slices; i++) {
textures.push_back(texture_loader(i));
}
return textures;
}
GLImage(int _width,
int _height,
int _channels,
int _tile_x,
int _tile_y,
std::function<const GLTexture*(int slice)> texture_loader)
: width(_width),
height(_height),
channels(_channels),
data_size(sizeof(T)),
tile_x(_tile_x),
tile_y(_tile_y),
texture_width(_width * _tile_x),
texture_height(_height * _tile_y),
slices(channels_to_slices(_channels, _tile_x, _tile_y)),
textures(allocate_textures(slices, texture_loader)) {
CAFFE_ENFORCE_EQ(
slices, ((channels + 3) / 4 + tile_x * tile_y - 1) / (tile_x * tile_y));
}
GLImage(int _width,
int _height,
int _channels,
int _tile_x,
int _tile_y,
bool _destroy,
std::function<const GLTexture*(int slice)> texture_loader)
: width(_width),
height(_height),
channels(_channels),
data_size(sizeof(T)),
tile_x(_tile_x),
tile_y(_tile_y),
texture_width(_width * _tile_x),
texture_height(_height * _tile_y),
slices(channels_to_slices(_channels, _tile_x, _tile_y)),
textures(allocate_textures(slices, texture_loader)) {
CAFFE_ENFORCE_EQ(slices * tile_x * tile_y, (channels + 3) / 4);
}
GLImage()
: width(0),
height(0),
channels(0),
data_size(sizeof(T)),
tile_x(0),
tile_y(0),
texture_width(0),
texture_height(0),
slices(0){};
virtual ~GLImage() {
gl_log(GL_VERBOSE, "deleting GLImage\n");
for (auto&& texture : textures) {
delete texture;
}
}
};
template <typename T>
class GLImageVector {
private:
std::vector<GLImage<T>*> images_;
int num_images_ = 0;
int width_ = 0;
int height_ = 0;
int channels_ = 0;
int tile_x_ = 0;
int tile_y_ = 0;
public:
GLImage<T>* operator[](int index) const {
CAFFE_ENFORCE_LT(index, num_images_, "Out of bounds when accessing GLImageVector");
return images_[index];
}
void push_back(GLImage<T>* image) {
CAFFE_ENFORCE_EQ(image->channels, channels_);
CAFFE_ENFORCE_EQ(image->width, width_);
CAFFE_ENFORCE_EQ(image->height, height_);
CAFFE_ENFORCE_EQ(image->tile_x, tile_x_);
CAFFE_ENFORCE_EQ(image->tile_y, tile_y_);
images_.push_back(image);
CAFFE_ENFORCE_LE(images_.size(), num_images_);
}
int size() const { return images_.size(); }
int channels() const { return channels_; }
int width() const { return width_; }
int height() const { return height_; }
int tile_x() const { return tile_x_; }
int tile_y() const { return tile_y_; }
int slices() const { return size() > 0 ? images_[0]->slices : 0; }
GLImageVector(int num_images, int width, int height, int channels, int tile_x = 1, int tile_y = 1)
: num_images_(num_images),
width_(width),
height_(height),
channels_(channels),
tile_x_(tile_x),
tile_y_(tile_y) {}
GLImageVector() {}
~GLImageVector() {
for (int i = 0; i < images_.size(); i++) {
delete images_[i];
}
}
};

View File

@ -1,66 +0,0 @@
#include "GLImageAllocator.h"
#include "arm_neon_support.h"
template <class T>
GLImageVector<T>* GLImageAllocator<T>::newImage(
int num_images, int width, int height, int channels, int tile_x, int tile_y, bool is_output) {
GLImageVector<T>* images =
new GLImageVector<T>(num_images, width, height, channels, tile_x, tile_y);
for (int i = 0; i < num_images; i++) {
images->push_back(
new GLImage<T>(width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
bool usePadding = is_output;
return new GLPlainTexture(type, nullptr, width * tile_x, height * tile_y, usePadding);
}));
}
return images;
}
template <class T>
GLImageVector<T>* GLImageAllocator<T>::newImage(
int num_images,
int width,
int height,
int channels,
int tile_x,
int tile_y,
std::function<const GLTexture*(const int width, const int height)> textureAllocator) {
GLImageVector<T>* images =
new GLImageVector<T>(num_images, width, height, channels, tile_x, tile_y);
for (int i = 0; i < num_images; i++) {
images->push_back(
new GLImage<T>(width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
return textureAllocator(width, height);
}));
}
return images;
}
template <class T>
GLImageVector<T>* GLImageAllocator<T>::ShareTexture(const GLuint textureID,
int num_images,
int width,
int height,
int channels,
int tile_x,
int tile_y) {
GLImageVector<T>* images =
new GLImageVector<T>(num_images, width, height, channels, tile_x, tile_y);
for (int i = 0; i < num_images; i++) {
images->push_back(
new GLImage<T>(width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
return new GLPlainTexture(
GLImageAllocator<T>::type, textureID, width * tile_x, height * tile_y);
}));
}
return images;
}
template <>
const GLTexture::Type& GLImageAllocator<float16_t>::type = GLTexture::FP16;
template <>
const GLTexture::Type& GLImageAllocator<uint8_t>::type = GLTexture::UI8;
template class GLImageAllocator<float16_t>;
template class GLImageAllocator<uint8_t>;

View File

@ -1,37 +0,0 @@
#pragma once
#include "GLImage.h"
#include "GLPlainTexture.h"
template <class T>
class GLImageAllocator {
public:
static const GLTexture::Type& type;
GLImageAllocator() { gl_log(GL_VERBOSE, "%s\n", __PRETTY_FUNCTION__); }
virtual ~GLImageAllocator() { gl_log(GL_VERBOSE, "%s\n", __PRETTY_FUNCTION__); }
virtual GLImageVector<T>* newImage(
int num_images, int width, int height, int channels, int tile_x, int tile_y, bool is_output);
virtual GLImageVector<T>* newImage(
int num_images,
int width,
int height,
int channels,
int tile_x,
int tile_y,
std::function<const GLTexture*(const int width, const int height)> textureAllocator);
virtual GLImageVector<T>* ShareTexture(const GLuint textureID,
int num_images,
int width,
int height,
int channels,
int tile_x = 1,
int tile_y = 1);
static GLImageAllocator<T>* newGLImageAllocator();
};

View File

@ -1,20 +0,0 @@
#pragma once
#include <stdarg.h>
#include <stdio.h>
enum { GL_ERR = -1, GL_LOG = 0, GL_VERBOSE = 1 };
static constexpr int GL_LOG_LEVEL = GL_LOG;
static inline int gl_log(int level, const char* format, ...) {
int r = 0;
if (level <= GL_LOG_LEVEL) {
va_list args;
va_start(args, format);
r = vfprintf(stderr, format, args);
va_end(args);
}
return r;
}

View File

@ -1,93 +0,0 @@
#include "GLPBO.h"
#include "caffe2/core/logging.h"
GLPBO::~GLPBO() {
if (pboId != 0) {
gl_log(GL_LOG, "deleting PBO buffer %d\n", pboId);
glDeleteBuffers(1, &pboId);
pboId = 0;
}
if (pboFrameBuffer != 0) {
gl_log(GL_LOG, "deleting PBO frame buffer %d\n", pboFrameBuffer);
glDeleteFramebuffers(1, &pboFrameBuffer);
pboFrameBuffer = 0;
}
}
GLPBO* GLPBO::pboContext = NULL;
GLPBO* GLPBO::getContext() {
if (pboContext == NULL) {
pboContext = new GLPBO();
}
return pboContext;
}
void GLPBO::mapTextureData(GLuint _textureId,
GLsizei _width,
GLsizei _height,
GLsizei _stride,
GLsizei _channels,
const GLTexture::Type& _type,
std::function<void(const void* buffer,
size_t width,
size_t height,
size_t stride,
size_t channels,
const GLTexture::Type& type)> process) {
GLint defaultFramebuffer = 0;
glGetIntegerv(GL_FRAMEBUFFER_BINDING, &defaultFramebuffer);
if (pboFrameBuffer == 0) {
glGenFramebuffers(1, &pboFrameBuffer);
gl_log(GL_VERBOSE, "created PBO frame buffer %d\n", pboFrameBuffer);
}
glBindFramebuffer(GL_FRAMEBUFFER, pboFrameBuffer);
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, _textureId, 0);
int fbs = glCheckFramebufferStatus(GL_FRAMEBUFFER);
if (fbs != GL_FRAMEBUFFER_COMPLETE) {
std::stringstream errmsg;
errmsg << ": Frame buffer incomplete: " << fbs;
throw std::runtime_error(errmsg.str());
}
if (pboId == 0) {
glGenBuffers(1, &pboId);
gl_log(GL_VERBOSE, "created PBO buffer %d\n", pboId);
}
glBindBuffer(GL_PIXEL_PACK_BUFFER, pboId);
size_t buffer_size = _stride * _height * _channels * _type.dataSize();
if (buffer_size > pboSize) {
LOG(INFO) << "Allocating PBO of capacity " << buffer_size;
glBufferData(GL_PIXEL_PACK_BUFFER, buffer_size, NULL, GL_DYNAMIC_READ);
pboSize = buffer_size;
}
glReadBuffer(GL_COLOR_ATTACHMENT0);
glReadPixels(0, 0, _stride, _height, _type.format, _type.type, 0);
GLhalf* ptr = (GLhalf*)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, buffer_size, GL_MAP_READ_BIT);
if (ptr) {
process(ptr, _width, _height, _stride, _channels, _type);
} else {
std::stringstream errmsg;
errmsg << ": glMapBufferRange using PBO incomplete";
throw std::runtime_error(errmsg.str());
}
// Unmap buffer
glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
// Bind to the default FrameBuffer
glBindFramebuffer(GL_FRAMEBUFFER, defaultFramebuffer);
}

View File

@ -1,31 +0,0 @@
#pragma once
#include "GLTexture.h"
#include <functional>
class GLPBO {
GLuint pboId = 0;
GLuint pboSize = 0;
GLuint pboFrameBuffer = 0;
~GLPBO();
static GLPBO* pboContext;
public:
void mapTextureData(GLuint _textureId,
GLsizei _width,
GLsizei _height,
GLsizei _stride,
GLsizei _channels,
const GLTexture::Type& type,
std::function<void(const void* buffer,
size_t width,
size_t height,
size_t stride,
size_t channels,
const GLTexture::Type& type)> process);
static GLPBO* getContext();
};

View File

@ -1,58 +0,0 @@
#include "GLPlainTexture.h"
#include "GLPBO.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/timer.h"
#define half_float_supported (GLContext::getGLContext()->halfFloatTextureSupported())
#define FIXED_TYPE(_t) (((_t).type != GL_HALF_FLOAT || half_float_supported) ? (_t) : GLTexture::FP16_COMPAT)
GLPlainTexture::GLPlainTexture(
const Type& type, const void* input, GLsizei width, GLsizei height, bool use_padding, GLint filter, GLint wrap)
: GLTexture(FIXED_TYPE(type), width, height, use_padding, filter, wrap) {
// caffe2::Timer timer;
// timer.Start();
glGenTextures(1, &_textureId);
glBindTexture(GL_TEXTURE_2D, _textureId);
glTexImage2D(GL_TEXTURE_2D, 0, _type.internalFormat, _stride, _height, 0, _type.format, _type.type, input);
gl_log(
GL_VERBOSE,
"GLPlainTexture() - allocated textureId %d, internalFormat: 0x%X, format: 0x%X, type: 0x%X\n",
_textureId,
_type.internalFormat,
_type.format,
_type.type);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, _filter);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, _filter);
#if GL_EXT_texture_border_clamp
GLfloat borderColor[] = {0.0f, 0.0f, 0.0f, 0.0f};
glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR_EXT, borderColor);
// Set the texture to use the border clamp wrapping mode.
_wrap = GL_CLAMP_TO_BORDER_EXT;
#endif
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, _wrap);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, _wrap);
glBindTexture(GL_TEXTURE_2D, 0);
// LOG(INFO) << "glTexImage2D takes " << timer.MilliSeconds() << " ms";
}
GLPlainTexture::GLPlainTexture(
const Type& type, const GLuint textureID, GLsizei width, GLsizei height, bool use_padding, GLint filter, GLint wrap)
: GLTexture(FIXED_TYPE(type), width, height, use_padding, filter, wrap) {
_textureId = textureID;
isOwner = false;
gl_log(
GL_VERBOSE,
"GLPlainTexture() - wrapped textureId %d, internalFormat: 0x%X, format: 0x%X, type: 0x%X\n",
_textureId,
_type.internalFormat,
_type.format,
_type.type);
}

View File

@ -1,44 +0,0 @@
#pragma once
#include "GLContext.h"
#include "GLTexture.h"
class GLPlainTexture : public GLTexture {
private:
bool isOwner = true;
public:
GLPlainTexture(const Type& type,
const void* input,
GLsizei width,
GLsizei height,
bool use_padding = false,
GLint filter = GL_NEAREST,
GLint wrap = GL_CLAMP_TO_EDGE);
GLPlainTexture(const Type& type,
const GLuint textureID,
GLsizei width,
GLsizei height,
bool use_padding = false,
GLint filter = GL_NEAREST,
GLint wrap = GL_CLAMP_TO_EDGE);
~GLPlainTexture() {
if (glIsTexture(_textureId)) {
if (isOwner) {
gl_log(GL_VERBOSE, "~GLPlainTexture() - deleting texture %d\n", _textureId);
glDeleteTextures(1, &_textureId);
}
} else {
gl_log(GL_ERR, "not deleting texture %d\n", _textureId);
}
}
GLuint name() const { return _textureId; };
GLenum target() const { return GL_TEXTURE_2D; };
bool flipped() const { return false; };
};

View File

@ -1,63 +0,0 @@
#include "GLPredictor.h"
#include "GLContext.h"
#include "rewrite_net.h"
#include <vector>
namespace caffe2 {
template <class T>
void shareInputGLImage(Workspace* ws, const std::string& name, GLImageVector<T>* input) {
auto* blob = ws->GetBlob(name);
CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist");
blob->ShareExternal<GLImageVector<T>>(input);
}
template <class T>
const GLImageVector<T>* extractOutputGLImage(Workspace* ws, const std::string& name) {
auto* blob = ws->GetBlob(name);
CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist");
return &blob->template Get<GLImageVector<T>>();
}
const NetDef create_gl_run_net(const NetDef& init_net,
const NetDef& run_net,
bool use_texture_input) {
NetDef gl_run_net;
if (!tryConvertToOpenGL(init_net, run_net, &gl_run_net, use_texture_input)) {
CAFFE_THROW("Failed to convert model to OpenGL");
}
return gl_run_net;
}
GLPredictor::GLPredictor(const NetDef& init_net,
const NetDef& run_net,
bool use_texture_input,
Workspace* parent)
: Predictor(init_net, create_gl_run_net(init_net, run_net, use_texture_input), parent) {}
GLPredictor::~GLPredictor() {}
template <class T>
bool GLPredictor::run(std::vector<GLImageVector<T>*>& inputs,
std::vector<const GLImageVector<T>*>* outputs) {
const NetDef& run_net_ = Predictor::def();
CAFFE_ENFORCE(inputs.size() <= run_net_.external_input_size());
for (auto i = 0; i < inputs.size(); ++i) {
shareInputGLImage<T>(Predictor::ws(), run_net_.external_input(i), inputs[i]);
}
if (!Predictor::ws()->RunNet(run_net_.name())) {
return false;
}
for (auto i = 0; i < run_net_.external_output_size(); ++i) {
outputs->push_back(extractOutputGLImage<T>(Predictor::ws(), run_net_.external_output(i)));
}
return true;
}
template bool GLPredictor::run(std::vector<GLImageVector<uint8_t>*>& inputs,
std::vector<const GLImageVector<uint8_t>*>* outputs);
} // namespace caffe2

View File

@ -1,21 +0,0 @@
#pragma once
#include "GLImage.h"
#include "caffe2/core/net.h"
#include "caffe2/predictor/predictor.h"
namespace caffe2 {
class GLPredictor : public Predictor {
public:
GLPredictor(const NetDef& init_net,
const NetDef& run_net,
bool use_texture_input = false,
Workspace* parent = nullptr);
template <class T>
bool run(std::vector<GLImageVector<T>*>& inputs, std::vector<const GLImageVector<T>*>* outputs);
~GLPredictor();
};
} // namespace caffe2

View File

@ -1,71 +0,0 @@
#include "GLTexture.h"
#include "DataTransfer.h"
#include "GLPBO.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/timer.h"
#if CAFFE2_ANDROID && defined(__ARM_NEON__)
#include "../android/AndroidGLContext.h"
// https://community.arm.com/thread/10002
void arm_memcpy(volatile unsigned char* dst, volatile unsigned char* src, int sz) {
if (sz & 63) {
sz = (sz & -64) + 64;
}
asm volatile(
"NEONCopyPLD: \n"
" VLDM %[src]!,{d0-d7} \n"
" VSTM %[dst]!,{d0-d7} \n"
" SUBS %[sz],%[sz],#0x40 \n"
" BGT NEONCopyPLD \n"
: [dst] "+r"(dst), [src] "+r"(src), [sz] "+r"(sz)
:
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
}
#endif
const GLTexture::Type GLTexture::FP16 = {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT};
const GLTexture::Type GLTexture::UI8 = {GL_RGBA, GL_RGBA, GL_UNSIGNED_BYTE};
const GLTexture::Type GLTexture::FP16_COMPAT = {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT};
void GLTexture::map_read(std::function<void(const void* buffer,
size_t width,
size_t height,
size_t stride,
size_t channels,
const Type& type)> process) const {
GLPBO* pbo = GLPBO::getContext();
pbo->mapTextureData(_textureId, _width, _height, _stride, _channels, _type, process);
}
void GLTexture::map_load(std::function<void(void* buffer,
size_t width,
size_t height,
size_t stride,
size_t channels,
const Type& type)> process) const {
const int alignment = 32; // 4 * _type.dataSize();
void* buffer = nullptr;
size_t buffer_size = _width * _height * _channels * _type.dataSize();
#ifdef __ANDROID__
buffer = (void*)memalign(alignment, buffer_size);
#else
posix_memalign((void**)&buffer, alignment, buffer_size);
#endif
CAFFE_ENFORCE(buffer);
process(buffer, _width, _height, _width, _channels, _type);
loadData(buffer);
free(buffer);
}
void GLTexture::loadData(const void* pixels) const {
glBindTexture(GL_TEXTURE_2D, _textureId);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, _width, _height, _type.format, _type.type, pixels);
glBindTexture(GL_TEXTURE_2D, 0);
}

View File

@ -1,105 +0,0 @@
#pragma once
#include "GL.h"
#include "GLLogging.h"
class GLTexture {
public:
struct Type {
const GLenum internalFormat;
const GLenum format;
const GLenum type;
int dataSize() const {
switch (type) {
case GL_UNSIGNED_INT:
return 4;
case GL_HALF_FLOAT:
return 2;
case GL_UNSIGNED_BYTE:
return 1;
default:
throw std::runtime_error("Unknown Texture Type");
}
}
int channels() const {
switch (format) {
case GL_R8:
return 1;
case GL_RG8:
return 2;
// case GL_BGRA:
case GL_RG_INTEGER:
case GL_RGBA:
return 4;
default:
throw std::runtime_error("Unknown Texture Format");
}
}
};
static const Type FP16;
static const Type FP16_COMPAT;
static const Type UI8;
protected:
const Type& _type;
const GLsizei _width;
const GLsizei _height;
const GLsizei _stride;
const GLsizei _channels;
const bool _use_padding;
GLint _filter;
GLint _wrap;
GLuint _textureId;
public:
GLTexture(const Type& type,
int width,
int height,
int stride,
bool use_padding,
GLint filter,
GLint wrap)
: _type(type),
_width(width),
_height(height),
_stride(stride),
_channels(type.channels()),
_use_padding(use_padding),
_filter(filter),
_wrap(wrap) {}
GLTexture(const Type& type, int width, int height, bool use_padding, GLint filter, GLint wrap)
: GLTexture(type,
width,
height,
use_padding ? (width + 7) / 8 * 8 : width,
use_padding,
filter,
wrap) {}
virtual ~GLTexture() {}
virtual GLuint name() const = 0;
virtual GLenum target() const = 0;
virtual bool flipped() const = 0;
virtual void map_read(std::function<void(const void* buffer,
size_t width,
size_t height,
size_t stride,
size_t channels,
const Type& type)> process) const;
virtual void map_load(std::function<void(void* buffer,
size_t width,
size_t height,
size_t stride,
size_t channels,
const Type& type)> process) const;
void loadData(const void* pixels) const;
};

View File

@ -1,47 +0,0 @@
#pragma once
#include "GLImageAllocator.h"
namespace caffe2 {
template <class T>
class ImageAllocator {
GLImageAllocator<T>* glImageAllocator;
public:
ImageAllocator() : glImageAllocator(GLImageAllocator<T>::newGLImageAllocator()) {}
virtual ~ImageAllocator() { delete glImageAllocator; }
GLImageVector<T>* newImage(
int num_images, int width, int height, int channels, bool is_output = false) {
const int tile_x = 1, tile_y = 1;
return glImageAllocator->newImage(
num_images, width, height, channels, tile_x, tile_y, is_output);
}
GLImageVector<T>* newImage(int num_images,
int width,
int height,
int channels,
int tile_x,
int tile_y,
bool is_output = false) {
return glImageAllocator->newImage(
num_images, width, height, channels, tile_x, tile_y, is_output);
}
GLImageVector<T>* newImage(
int num_images,
int width,
int height,
int channels,
int tile_x,
int tile_y,
std::function<const GLTexture*(const int width, const int height)> textureAllocator) {
return glImageAllocator->newImage(
num_images, width, height, channels, tile_x, tile_y, textureAllocator);
}
};
} // namespace caffe2

View File

@ -1,12 +0,0 @@
#pragma once
#include "caffe2/core/common.h"
#ifdef __ARM_NEON__
#if CAFFE2_IOS
#include "arm_neon.h"
#elif CAFFE2_ANDROID
#include "caffe2/mobile/contrib/opengl/android/arm_neon_support.h"
#endif
#endif

View File

@ -1,367 +0,0 @@
#include "rewrite_net.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/proto_utils.h"
#include <unordered_map>
#include <unordered_set>
#ifdef CAFFE2_ANDROID
#include "../android/AndroidGLContext.h"
#endif
namespace caffe2 {
struct Analysis {
struct SSA {
using BlobVersions = std::unordered_map<std::string, size_t>;
BlobVersions inVersions;
BlobVersions outVersions;
};
std::vector<SSA> ssa;
std::unordered_map<std::string, std::unordered_map<size_t, std::vector<size_t>>> inUsages;
};
static Analysis analyzeNet(const NetDef& net) {
Analysis::SSA::BlobVersions frontier;
Analysis analysis;
auto play = [&](size_t i, const OperatorDef& op) {
Analysis::SSA::BlobVersions inVersions;
for (const auto& s : op.input()) {
inVersions[s] = frontier[s];
analysis.inUsages[s][frontier[s]].push_back(i);
}
Analysis::SSA::BlobVersions outVersions;
for (const auto& s : op.output()) {
if (frontier.find(s) != frontier.end()) {
frontier[s] += 1;
}
outVersions[s] = frontier[s];
}
analysis.ssa.push_back(Analysis::SSA{inVersions, outVersions});
};
for (auto i = 0; i < net.op_size(); ++i) {
play(i, net.op(i));
}
return analysis;
}
static void insertCopyToGPUOp(NetDef& predictNet, const std::string& cpu_blob) {
auto* op = predictNet.add_op();
op->set_name("CopyToOpenGL");
op->set_type("CopyToOpenGL");
op->add_input(cpu_blob);
op->add_output(cpu_blob + "_M");
}
static void insertCopyFromGPUOp(NetDef& predictNet, const std::string& cpu_blob) {
// add argument "is_last" to the last op to signal this is the last operator before the
// CopyFromOpenGL op
auto* last_op = predictNet.mutable_op(predictNet.op_size() - 1);
auto* arg = last_op->add_arg();
arg->set_name("is_last");
arg->set_i(1);
auto* op = predictNet.add_op();
op->set_name("CopyFromOpenGL");
op->set_type("CopyFromOpenGL");
op->add_input(cpu_blob + "_M");
op->add_output(cpu_blob);
}
static NetDef insertInputOutputCopyOps(const NetDef& def, std::unordered_set<std::string>& glOps) {
// Do some validation of the outputs. For this version, we require:
// - a single input (first element of external_input()) is consumed by the NetDef
// - a single output (first element of external_output()) is produced by the NetDef.
// - the input is consumed by def.op(0), and this is the only consumer.
// - the output is produced by def.op(-1).
CAFFE_ENFORCE_GE(def.external_input_size(), 1);
CAFFE_ENFORCE_GE(def.external_output_size(), 1);
auto analysis = analyzeNet(def);
// enforce a single use of the input blob.
CAFFE_ENFORCE_GE(def.op_size(), 1);
const auto& inputBlob = def.external_input(0);
// Enforce that the input blob has a single usage - in the first operator.
CAFFE_ENFORCE(analysis.inUsages[inputBlob][0] == (std::vector<size_t>{0}));
// Enforce that the external_output(0) blob is produced by the last operator in this sequence.
const auto& outputBlob = def.external_output(0);
CAFFE_ENFORCE(analysis.ssa.back().outVersions.find(outputBlob) !=
analysis.ssa.back().outVersions.end());
const auto& outputBlobVersion = analysis.ssa.back().outVersions[outputBlob];
// This should hold true by definition of the SSA analysis.
CAFFE_ENFORCE(analysis.inUsages[outputBlob].find(outputBlobVersion) ==
analysis.inUsages[outputBlob].end());
NetDef mdef;
mdef.CopyFrom(def);
mdef.clear_op();
std::unordered_map<std::string, std::set<size_t>> cpu_blobs, gpu_blobs;
cpu_blobs[def.external_input(0)].insert(0);
for (auto i = 0; i < def.op_size(); i++) {
const auto& currentOp = def.op(i);
if (glOps.count(currentOp.type()) > 0) {
// OpenGL Op
// insert copyToOpenGLOp
for (auto j = 0; j < currentOp.input_size(); j++) {
auto& input = currentOp.input(j);
auto version = analysis.ssa[i].inVersions[input];
if (cpu_blobs[input].count(version) > 0) {
insertCopyToGPUOp(mdef, input);
gpu_blobs[input].insert(version);
cpu_blobs[input].erase(version);
}
// Only the first input should be OpenGL texture
// Otherwise, copyToOpenGLOp will be inserted for the weights,
// which are outputs of QuantDecode
if (currentOp.type().find("OpenGLConv") == 0) {
if (j == 0) {
break;
}
}
}
auto* op = mdef.add_op();
op->CopyFrom(currentOp);
// swap input blob
for (auto j = 0; j < currentOp.input_size(); j++) {
auto& input = currentOp.input(j);
auto version = analysis.ssa[i].inVersions[input];
if (gpu_blobs[input].count(version) > 0) {
op->set_input(j, input + "_M");
}
}
// swap output blob
for (auto j = 0; j < currentOp.output_size(); j++) {
auto& output = currentOp.output(j);
auto version = analysis.ssa[i].outVersions[output];
op->set_output(j, output + "_M");
gpu_blobs[output].insert(version);
}
// insert copyFromOpenGLOp after the last op if the last op is an OpenGL op
if (i == def.op_size() - 1) {
insertCopyFromGPUOp(mdef, currentOp.output(0));
}
} else {
// CPU Op
// insert copyFromOpenGLOp
for (auto j = 0; j < currentOp.input_size(); j++) {
auto& input = currentOp.input(j);
auto version = analysis.ssa[i].inVersions[input];
if (gpu_blobs[input].count(version) > 0) {
insertCopyFromGPUOp(mdef, input);
}
}
auto* op = mdef.add_op();
op->CopyFrom(currentOp);
for (auto j = 0; j < currentOp.output_size(); j++) {
auto& output = currentOp.output(j);
auto version = analysis.ssa[i].outVersions[output];
cpu_blobs[output].insert(version);
}
}
}
return mdef;
}
static bool tryFuseAdjacentOps(const OperatorDef& currentOp,
const OperatorDef& nextOp,
OperatorDef* fusedOp,
std::unordered_set<std::string>& glOps) {
// Check for possible invalid opportunities.
if (currentOp.output_size() != 1 || nextOp.output_size() != 1) {
return false;
}
// The fused op cannot be inplace
if (currentOp.output(0) != nextOp.input(0) || currentOp.input(0) == nextOp.output(0)) {
return false;
}
static const std::map<std::pair<std::string, std::string>, std::string> fusionOpportunities = {
{{"OpenGLInstanceNorm", "OpenGLPRelu"}, "OpenGLInstanceNormPRelu"},
{{"OpenGLConv", "OpenGLPRelu"}, "OpenGLConvPRelu"},
{{"OpenGLConv", "OpenGLRelu"}, "OpenGLConvRelu"},
{{"OpenGLConvTranspose", "OpenGLPRelu"}, "OpenGLConvTransposePRelu"}};
auto it = fusionOpportunities.find({currentOp.type(), nextOp.type()});
if (it == fusionOpportunities.end()) {
return false;
}
glOps.insert(it->second);
fusedOp->CopyFrom(currentOp);
fusedOp->set_output(0, nextOp.output(0));
fusedOp->set_type(it->second);
for (auto i = 1; i < nextOp.input_size(); i++) {
fusedOp->add_input(nextOp.input(i));
}
return true;
}
static NetDef runOpenGLFusion(const NetDef& def, std::unordered_set<std::string>& glOps) {
CHECK_GE(def.op_size(), 1);
NetDef mdef;
mdef.CopyFrom(def);
mdef.clear_op();
auto i = 0;
while (i < def.op_size()) {
if (i == def.op_size() - 1) {
VLOG(2) << "Last operator, skipping";
auto* op = mdef.add_op();
op->CopyFrom(def.op(i));
i += 1;
continue;
}
const auto& currentOp = def.op(i);
const auto& nextOp = def.op(i + 1);
OperatorDef fusedOp;
if (tryFuseAdjacentOps(currentOp, nextOp, &fusedOp, glOps)) {
VLOG(2) << "Found an adjacent fusion for: " << currentOp.type() << ", " << nextOp.type();
// We can fuse.
auto* op = mdef.add_op();
op->CopyFrom(fusedOp);
i += 2;
continue;
}
VLOG(2) << "No fusion available for: " << currentOp.type() << ", " << nextOp.type();
// Just emit the current type.
auto* op = mdef.add_op();
op->CopyFrom(currentOp);
i += 1;
}
return mdef;
}
void dumpDefForOpenGL(const NetDef& d) {
for (const auto& op : d.op()) {
LOG(INFO) << op.input(0) << " -> " << op.type() << " -> " << op.output(0);
}
}
// // For debugging
// void dumpDefForOpenGL(const NetDef &net) {
// for (const auto &op : net.op()) {
// printf("***Operator: %s\n", op.type().c_str());
// for (auto input : op.input()) {
// printf("\tInput: %s\n", input.c_str());
// }
//
// for (auto output : op.output()) {
// printf("\tOutput: %s\n", output.c_str());
// }
// }
//}
NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput, bool useTiling, bool runFusion) {
CAFFE_ENFORCE_GE(predictNet.op_size(), 1);
NetDef net;
net.CopyFrom(predictNet);
std::unordered_map<std::string, std::string> replacements(
{{"OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess",
useTextureInput ? "OpenGLTextureToTextureStylizerPreprocess"
: "OpenGLTensorToTextureStylizerPreprocess"},
{"OpenGLBRGNCHWCToPackedInt8BGRAStylizerDeprocess",
useTextureInput ? "OpenGLTextureToTextureStylizerDeprocess"
: "OpenGLTextureToTensorStylizerDeprocess"}});
std::unordered_set<std::string> openGLOps; // Used to insert copy ops
bool needCopyOps = false;
const auto& opKeyList = CPUOperatorRegistry()->Keys();
auto opKeySet = std::set<std::string>(opKeyList.begin(), opKeyList.end());
#ifdef CAFFE2_ANDROID
// TODO: debug InstanceNorm models on Mali devices
AndroidGLContext* context = (AndroidGLContext*)GLContext::getGLContext();
if (context->get_platform() == Mali) {
opKeySet.erase("OpenGLInstanceNorm");
opKeySet.erase("OpenGLInstanceNormPRelu");
}
#endif
for (auto i = 0; i < net.op_size(); ++i) {
auto* op = net.mutable_op(i);
string openGLOp = std::string("OpenGL") + op->type();
if (replacements.count(openGLOp) > 0) {
openGLOp = replacements[openGLOp];
}
if (opKeySet.find(openGLOp) != opKeySet.end()) {
op->set_type(openGLOp);
openGLOps.insert(openGLOp);
if (useTiling) {
auto* arg = op->add_arg();
arg->set_name("tiling");
arg->set_i(1);
}
} else {
needCopyOps = true;
}
}
if (useTextureInput && needCopyOps) {
CAFFE_THROW("OpenGL operator missing");
}
if (runFusion) {
net = runOpenGLFusion(net, openGLOps);
}
if (net.op(0).type() == replacements["OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess"]) {
// For end-to-end testing
if (net.op(net.op_size() - 1).type() !=
replacements["OpenGLBRGNCHWCToPackedInt8BGRAStylizerDeprocess"]) {
auto* last_op = net.mutable_op(net.op_size() - 1);
auto output = last_op->output(0) + "M";
last_op->set_output(0, output);
auto* copy_op = net.add_op();
copy_op->set_name("CopyFromOpenGL");
copy_op->set_type("CopyFromOpenGL");
copy_op->add_input(output);
// rename output blob in case input and output blob has the same name
copy_op->add_output(net.external_output(0));
}
} else {
if (!useTextureInput) {
needCopyOps = true;
}
}
// copy ops are needed when the input is not a texture
if (needCopyOps) {
// For non style transfer cases
net = insertInputOutputCopyOps(net, openGLOps);
}
return net;
}
bool tryConvertToOpenGL(const NetDef& initNet,
const NetDef& predictNet,
NetDef* glPredictNet,
bool useTextureInput,
bool useTiling,
bool runFusion) {
try {
// Throws if unsupported operators are found.
*glPredictNet = rewritePredictNetForOpenGL(predictNet, useTextureInput, useTiling, runFusion);
dumpDefForOpenGL(*glPredictNet);
// Throws if unsupported parameters are found.
Workspace ws;
ws.RunNetOnce(initNet);
ws.CreateNet(*glPredictNet);
LOG(INFO) << "OpenGL is successfully enabled";
return true;
} catch (const std::exception& e) {
LOG(ERROR) << "Caught exception trying to convert NetDef to OpenGL: " << e.what();
return false;
}
}
} // namespace caffe2

View File

@ -1,20 +0,0 @@
#pragma once
#include "GLPredictor.h"
#include "caffe2/predictor/predictor.h"
namespace caffe2 {
bool tryConvertToOpenGL(const NetDef& initNet,
const NetDef& predictNet,
NetDef* glPredictNet,
bool useTextureInput = false,
bool useTiling = false,
bool runFusion = true);
// Exposed for testing
NetDef rewritePredictNetForOpenGL(const NetDef& predictNet,
bool useTextureInput = false,
bool useTiling = false,
bool runFusion = true);
void dumpDefForOpenGL(const NetDef& net);
} // namespace caffe2

View File

@ -1,2 +0,0 @@
file(GLOB_RECURSE tmp *.mm *.cc)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)

View File

@ -1,19 +0,0 @@
#include "IOSGLContext.h"
std::unique_ptr<GLContext> GLContext::_glcontext = nullptr;
void GLContext::initGLContext() {
if (_glcontext == nullptr) {
_glcontext.reset(new IOSGLContext());
}
}
GLContext* GLContext::getGLContext() {
if (_glcontext == nullptr) {
initGLContext();
}
return _glcontext.get();
}
void GLContext::deleteGLContext() { _glcontext.reset(nullptr); }

View File

@ -1,11 +0,0 @@
#include "IOSGLImageAllocator.h"
#include <arm_neon.h>
template <typename T>
GLImageAllocator<T>* GLImageAllocator<T>::newGLImageAllocator() {
return new IOSGLImageAllocator<T>();
}
template GLImageAllocator<float16_t>* GLImageAllocator<float16_t>::newGLImageAllocator();
template GLImageAllocator<uint8_t>* GLImageAllocator<uint8_t>::newGLImageAllocator();

View File

@ -1,22 +0,0 @@
#pragma once
#include "../core/GLContext.h"
#include "../core/GLTexture.h"
#import <CoreVideo/CoreVideo.h>
class IOSGLContext : public GLContext {
void* oglContext;
void* oldContext;
CVOpenGLESTextureCacheRef textureCache;
public:
IOSGLContext();
~IOSGLContext();
const GLTexture* createNewTexture(CVPixelBufferRef pixelBuffer, const GLTexture::Type& type);
void set_context();
void reset_context();
void flush_context();
};

View File

@ -1,98 +0,0 @@
#include "IOSGLContext.h"
#include "IOSGLTexture.h"
#import <sstream>
#import <OpenGLES/EAGL.h>
IOSGLContext::IOSGLContext() {
auto const currentContext = [EAGLContext currentContext];
oldContext = (void*)CFBridgingRetain(currentContext);
if (currentContext != nil && [currentContext API] == kEAGLRenderingAPIOpenGLES3) {
oglContext = (void*)CFBridgingRetain(currentContext);
gl_log(GL_LOG, "Reusing current context %p\n", oglContext);
} else {
oglContext =
(void*)CFBridgingRetain([[EAGLContext alloc] initWithAPI:kEAGLRenderingAPIOpenGLES3]);
gl_log(GL_LOG, "Created a new context %p\n", oglContext);
}
if (!oglContext) {
throw std::runtime_error("Problem with OpenGL context");
}
set_context();
textureCache = NULL;
CVReturn err = CVOpenGLESTextureCacheCreate(
kCFAllocatorDefault, NULL, (__bridge EAGLContext*)oglContext, NULL, &textureCache);
if (err) {
std::stringstream errmsg;
errmsg << "Error at CVOpenGLESTextureCacheCreate " << err;
throw std::runtime_error(errmsg.str());
}
}
IOSGLContext::~IOSGLContext() {
gl_log(GL_VERBOSE, "~IOSGLContext()");
set_context();
if (textureCache) {
CFRelease(textureCache);
textureCache = 0;
}
reset_context();
// Explicitly release only after we `reset_context` since otherwise we are going to read from a
// dangling pointer.
if (oglContext) {
CFBridgingRelease(oglContext);
}
if (oldContext) {
CFBridgingRelease(oldContext);
}
}
const GLTexture* IOSGLContext::createNewTexture(CVPixelBufferRef pixelBuffer,
const GLTexture::Type& type) {
return new IOSGLTexture(type, textureCache, pixelBuffer);
}
void IOSGLContext::set_context() {
auto const currentContext = [EAGLContext currentContext];
if ((__bridge void*)currentContext != oglContext) {
if (![EAGLContext setCurrentContext:(__bridge EAGLContext*)oglContext]) {
throw std::runtime_error("Problem setting OpenGL context");
}
GLenum glError = glGetError();
if (glError != GL_NO_ERROR) {
gl_log(GL_ERR, "There is an error: 0x%X\n", glError);
}
gl_log(GL_VERBOSE, "Set context to %p\n", oglContext);
}
}
void IOSGLContext::reset_context() {
EAGLContext* currentContext = [EAGLContext currentContext];
if ((__bridge void*)currentContext != oldContext) {
GLenum glError = glGetError();
if (glError != GL_NO_ERROR) {
gl_log(GL_ERR, "There is an error before: 0x%X\n", glError);
}
if (![EAGLContext setCurrentContext:(__bridge EAGLContext*)oldContext]) {
throw std::runtime_error("Problem setting OpenGL context");
}
glError = glGetError();
if (glError != GL_NO_ERROR) {
gl_log(GL_ERR, "There is an error after: 0x%X\n", glError);
}
gl_log(GL_VERBOSE, "Reset context to %p\n", oldContext);
}
}
void IOSGLContext::flush_context() { CVOpenGLESTextureCacheFlush(textureCache, 0); }

View File

@ -1,78 +0,0 @@
#include "IOSGLImageAllocator.h"
#include "../core/GLImage.h"
#include "../core/GLImageAllocator.h"
#include "../core/GLPlainTexture.h"
#include "IOSGLContext.h"
#include "IOSGLTexture.h"
#include "../core/arm_neon_support.h"
template <class T>
GLImageVector<T>* IOSGLImageAllocator<T>::newImage(int num_images,
int width,
int height,
int channels,
int tile_x,
int tile_y,
bool useCVPixelBuffer) {
GLImageVector<T>* output_images =
new GLImageVector<T>(num_images, width, height, channels, tile_x, tile_y);
if (useCVPixelBuffer) {
IOSGLContext* gl_context = (IOSGLContext*)GLContext::getGLContext();
for (int i = 0; i < num_images; i++) {
GLImage<T>* output_image = new GLImage<T>(
width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
gl_log(GL_VERBOSE,
"%s pixelbuffers.size(): %ld\n",
__PRETTY_FUNCTION__,
pixelbuffers.size());
CVPixelBufferRef buffer = NULL;
int slices = (channels + 3) / 4;
int slice_index = i * slices + slice;
if (pixelbuffers.size() < slice_index + 1) {
const int texture_width = width * tile_x;
const int texture_height = height * tile_y;
buffer =
IOSGLTexture::createCVPixelBuffer(pixelFormat, texture_width, texture_height);
gl_log(GL_VERBOSE,
"created a new buffer %p for image %d slice %d of dimensions %dx%d\n",
buffer,
i,
slice,
texture_width,
texture_height);
pixelbuffers.push_back(buffer);
} else {
buffer = pixelbuffers[slice_index];
gl_log(GL_VERBOSE, "reused buffer %p for image %d slice %d\n", buffer, i, slice);
}
return gl_context->createNewTexture(buffer, GLImageAllocator<T>::type);
});
output_images->push_back(output_image);
}
} else {
for (int i = 0; i < num_images; i++) {
GLImage<T>* image = new GLImage<T>(
width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
return new GLPlainTexture(
GLImageAllocator<T>::type, nullptr, width * tile_x, height * tile_y);
});
output_images->push_back(image);
}
}
return output_images;
}
template <>
const FourCharCode IOSGLImageAllocator<float16_t>::pixelFormat = kCVPixelFormatType_64RGBAHalf;
template <>
const FourCharCode IOSGLImageAllocator<uint8_t>::pixelFormat = kCVPixelFormatType_32BGRA;
template class IOSGLImageAllocator<float16_t>;
template class IOSGLImageAllocator<uint8_t>;

View File

@ -1,34 +0,0 @@
#pragma once
#include "../core/GLImageAllocator.h"
#import <CoreVideo/CoreVideo.h>
template <class T>
class IOSGLImageAllocator : public GLImageAllocator<T> {
static const GLTexture::Type& type;
std::vector<CVPixelBufferRef> pixelbuffers;
public:
static const FourCharCode pixelFormat;
IOSGLImageAllocator() : GLImageAllocator<T>() { gl_log(GL_VERBOSE, "%s\n", __PRETTY_FUNCTION__); }
~IOSGLImageAllocator() {
gl_log(GL_VERBOSE, "%s\n", __PRETTY_FUNCTION__);
for (auto&& pixelbuffer : pixelbuffers) {
CFRelease(pixelbuffer);
}
}
GLImageVector<T>* newImage(int num_images,
int width,
int height,
int channels,
int tile_x,
int tile_y,
bool useCVPixelBuffer);
};

View File

@ -1,51 +0,0 @@
#pragma once
#include "../core/GLContext.h"
#include "../core/GLTexture.h"
#import <CoreVideo/CoreVideo.h>
class IOSGLTexture : public GLTexture {
CVOpenGLESTextureRef textureRef;
IOSGLTexture(const Type& type,
CVOpenGLESTextureCacheRef textureCache,
CVPixelBufferRef sourceImage,
GLint _filter = GL_NEAREST,
GLint _wrap = GL_CLAMP_TO_EDGE);
friend class IOSGLContext;
public:
const CVPixelBufferRef sourceImage;
~IOSGLTexture() { CFRelease(textureRef); }
void map_buffer(std::function<void(void* buffer,
size_t width,
size_t height,
size_t stride,
size_t channels,
const Type& type)> process) const;
virtual void map_read(std::function<void(const void* buffer,
size_t width,
size_t height,
size_t stride,
size_t channels,
const Type& type)> process) const;
virtual void map_load(std::function<void(void* buffer,
size_t width,
size_t height,
size_t stride,
size_t channels,
const Type& type)> process) const;
GLuint name() const { return CVOpenGLESTextureGetName(textureRef); }
GLenum target() const { return CVOpenGLESTextureGetTarget(textureRef); };
bool flipped() const { return CVOpenGLESTextureIsFlipped(textureRef); };
static CVPixelBufferRef createCVPixelBuffer(OSType pixelType, int32_t width, int32_t height);
};

View File

@ -1,121 +0,0 @@
#include "IOSGLTexture.h"
#include "../core/DataTransfer.h"
IOSGLTexture::IOSGLTexture(const Type& type,
CVOpenGLESTextureCacheRef textureCache,
CVPixelBufferRef _sourceImage,
GLint filter,
GLint wrap)
: GLTexture(type,
CVPixelBufferGetWidth(_sourceImage),
CVPixelBufferGetHeight(_sourceImage),
CVPixelBufferGetBytesPerRow(_sourceImage) / (type.channels() * type.dataSize()),
false,
filter,
wrap),
sourceImage(_sourceImage) {
CVReturn err = CVOpenGLESTextureCacheCreateTextureFromImage(kCFAllocatorDefault,
textureCache,
_sourceImage,
NULL,
GL_TEXTURE_2D,
_type.internalFormat,
_width,
_height,
_type.format,
_type.type,
0,
&textureRef);
if (!textureRef || err) {
gl_log(GL_ERR,
"something went wrong, sourceImage: %p, width: %d, height: %d, filter: %d, wrap: %d\n",
_sourceImage,
_width,
_height,
filter,
wrap);
}
_textureId = name();
gl_log(
GL_VERBOSE,
"IOSGLTexture() - allocated textureId %d, internalFormat: 0x%X, format: 0x%X, type: 0x%X\n",
_textureId,
_type.internalFormat,
_type.format,
_type.type);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, _textureId);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, filter);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, filter);
#if GL_EXT_texture_border_clamp
GLfloat borderColor[] = {0.0f, 0.0f, 0.0f, 0.0f};
glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR_EXT, borderColor);
// Set the texture to use the border clamp wrapping mode.
wrap = GL_CLAMP_TO_BORDER_EXT;
#endif
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, wrap);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, wrap);
glBindTexture(GL_TEXTURE_2D, 0);
}
CVPixelBufferRef IOSGLTexture::createCVPixelBuffer(OSType pixelFormat,
int32_t width,
int32_t height) {
NSDictionary* pixelBufferAttributes = @{
(id)kCVPixelBufferPixelFormatTypeKey : @(pixelFormat),
(id)kCVPixelFormatOpenGLESCompatibility : @YES,
(id)kCVPixelBufferIOSurfacePropertiesKey : @{/*empty dictionary*/}
};
CVPixelBufferRef buffer = NULL;
CVPixelBufferCreate(kCFAllocatorDefault,
width,
height,
pixelFormat,
(__bridge CFDictionaryRef)(pixelBufferAttributes),
&buffer);
return buffer;
}
void IOSGLTexture::map_buffer(std::function<void(void* buffer,
size_t width,
size_t height,
size_t stride,
size_t channels,
const Type& type)> process) const {
if (CVPixelBufferLockBaseAddress(sourceImage, 0) == kCVReturnSuccess) {
void* buffer = CVPixelBufferGetBaseAddress(sourceImage);
int buffer_stride = CVPixelBufferGetBytesPerRow(sourceImage) / (_channels * _type.dataSize());
process(buffer, _width, _height, buffer_stride, _channels, _type);
CVPixelBufferUnlockBaseAddress(sourceImage, 0);
}
}
void IOSGLTexture::map_load(std::function<void(void* buffer,
size_t width,
size_t height,
size_t stride,
size_t channels,
const Type& type)> process) const {
map_buffer(process);
}
void IOSGLTexture::map_read(std::function<void(const void* buffer,
size_t width,
size_t height,
size_t stride,
size_t channels,
const Type& type)> process) const {
// TODO: why is glFlush() only necessary when running tests
glFlush();
map_buffer(process);
}

View File

@ -1,2 +0,0 @@
file(GLOB_RECURSE tmp *.cc)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)

View File

@ -1,143 +0,0 @@
#include "../core/GLFilter.h"
#include "../core/GLImage.h"
#include "../core/ImageAllocator.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include <iostream>
#include <vector>
class GLAdd : public GLFilter {
public:
binding* inputData[2];
binding* outputSize;
GLAdd()
: GLFilter("GLAdd",
vertex_shader,
fragment_shader,
std::vector<binding*>(
{BINDING(outputSize), BINDING(inputData[0]), BINDING(inputData[1])}),
{/* no uniform blocks */},
{/* no attributes */},
{/* no replacements */}) {}
template <typename T>
void add(const GLImageVector<T>& input_image0,
const GLImageVector<T>& input_image1,
const GLImageVector<T>& output_image);
static const char* fragment_shader;
};
// MARK: GLSL
const char* GLAdd::fragment_shader = R"GLSL(#version 300 es
precision mediump float;
precision mediump int;
in highp vec2 v_texCoord;
uniform ivec2 outputSize;
TEXTURE_INPUT(inputData[2]);
TEXTURE_OUTPUT(0, outputData);
void main() {
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
vec4 A = TEXTURE_LOAD(inputData[0], texelCoord);
vec4 B = TEXTURE_LOAD(inputData[1], texelCoord);
vec4 value = A + B;
outputData = TEXTURE_STORE(value);
}
)GLSL";
template <typename T>
void GLAdd::add(const GLImageVector<T>& input_images0,
const GLImageVector<T>& input_images1,
const GLImageVector<T>& output_images) {
const int num_images = input_images0.size();
for (int i = 0; i < num_images; i++) {
GLImage<T>* input_image0 = input_images0[i];
GLImage<T>* input_image1 = input_images1[i];
int input_slices = input_image0->slices;
GLImage<T>* output_image = output_images[i];
int output_slices = output_image->slices;
for (int is = 0; is < input_slices; is++) {
std::vector<texture_attachment> input_attachments;
input_attachments.push_back({input_image0->textures[is], inputData[0]});
input_attachments.push_back({input_image1->textures[is], inputData[1]});
run(input_attachments,
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
[&]() { glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height); },
output_image->texture_width,
output_image->texture_height);
}
}
}
namespace caffe2 {
template <typename T>
class OpenGLAddOp final : public Operator<CPUContext>, ImageAllocator<T> {
public:
OpenGLAddOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws) {
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("broadcast") == false,
"OpenGLAdd does not support broadcast");
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false, "OpenGLAdd does not support axis");
}
bool RunOnDevice() override {
const GLImageVector<T>& input0 = Inputs()[0]->template Get<GLImageVector<T>>();
const GLImageVector<T>& input1 = Inputs()[1]->template Get<GLImageVector<T>>();
CAFFE_ENFORCE_EQ(input0.size(), input1.size());
const int num_images = input0.size();
const int input_channels = input0.channels();
const int input_width = input0.width();
const int input_height = input0.height();
const int input_tile_x = input0.tile_x();
const int input_tile_y = input0.tile_y();
CAFFE_ENFORCE_EQ(input1.channels(), input_channels);
CAFFE_ENFORCE_EQ(input1.width(), input_width);
CAFFE_ENFORCE_EQ(input1.height(), input_height);
CAFFE_ENFORCE_EQ(input1.tile_x(), input_tile_x);
CAFFE_ENFORCE_EQ(input1.tile_y(), input_tile_y);
const int output_channels = input_channels;
const int output_width = input_width;
const int output_height = input_height;
const int output_tile_x = input_tile_x;
const int output_tile_y = input_tile_y;
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
GLImageVector<T>* output = ImageAllocator<T>::newImage(
num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
if (!_add) {
_add.reset(new GLAdd());
}
_add->add(input0, input1, *output);
Outputs()[0]->Reset(output);
return true;
}
private:
std::unique_ptr<GLAdd> _add;
};
REGISTER_CPU_OPERATOR(OpenGLAdd, OpenGLAddOp<float16_t>);
OPERATOR_SCHEMA(OpenGLAdd).NumInputs(2).NumOutputs(1);
} // namespace caffe2

View File

@ -1,202 +0,0 @@
#include "../core/GLFilter.h"
#include "../core/GLImage.h"
#include "../core/ImageAllocator.h"
#include "gl_tiling_utils.h"
#include <iostream>
#include <vector>
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include "caffe2/utils/math.h"
class GLConcat : public GLFilter {
public:
bool tiling_;
binding* inputData;
binding* outputSize;
binding* inputTileRange;
binding* input_tile_x;
GLConcat(tile_descriptor output_tile_geometries, bool tiling = false)
: GLFilter(
"GLConcat",
vertex_shader,
fragment_shader,
std::vector<binding*>({BINDING(outputSize),
BINDING(inputData),
BINDING(inputTileRange),
BINDING(input_tile_x)}),
{/* no uniform blocks */},
{/* no attributes */},
{{"TILING", c10::to_string(tiling)},
{"OUTPUT_TILES", c10::to_string(output_tile_geometries.tiles)},
{"OUTPUT_TILE_X",
c10::to_string(output_tile_geometries.tile_dims.x)},
{"OUTPUT_TILE_WIDTH",
c10::to_string(output_tile_geometries.tile_size.x)},
{"OUTPUT_TILE_HEIGHT",
c10::to_string(output_tile_geometries.tile_size.y)}}),
tiling_(tiling) {}
template <typename T>
void concat(const GLImageVector<T>** input_images, const GLImageVector<T>& output_image, int size);
static const char* fragment_shader;
};
// MARK: GLSL
const char* GLConcat::fragment_shader = R"GLSL(#version 300 es
#define TILING $(TILING)
// tiling
#define OUTPUT_TILES $(OUTPUT_TILES)
#define OUTPUT_TILE_X $(OUTPUT_TILE_X)
#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
precision mediump float;
precision mediump int;
in highp vec2 v_texCoord;
TEXTURE_INPUT(inputData);
TEXTURE_OUTPUT(0, outputData);
uniform ivec2 outputSize;
uniform ivec2 inputTileRange; // (]
uniform int input_tile_x;
#if TILING
const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
void main() {
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
if (tileNum >= inputTileRange.x && tileNum < inputTileRange.y) {
tileNum = tileNum - inputTileRange.x;
texelCoord = ivec2(tileNum % input_tile_x, tileNum / input_tile_x) * ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT) + tileCoord;
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
outputData = TEXTURE_STORE(value);
} else {
// early termination
discard;
}
}
#else
void main() {
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
outputData = TEXTURE_STORE(value);
}
#endif
)GLSL";
template <typename T>
void GLConcat::concat(const GLImageVector<T>** input_images, const GLImageVector<T>& output_images, int input_size) {
for (int k = 0; k < output_images.size(); k++) {
GLImage<T>* output_image = output_images[k];
int is = 0, os = 0;
for (int i = 0; i < input_size; i++) {
for (int j = 0; j < input_images[i]->slices(); j++) {
GLImage<T>* input_image = (*input_images[i])[k];
std::vector<texture_attachment> input_attachments;
input_attachments.push_back({input_image->textures[j], inputData});
run(input_attachments,
{output_image->textures.begin() + os, output_image->textures.begin() + os + 1},
[&]() {
glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
glUniform2i(inputTileRange->location, is, is + input_image->tile_x * input_image->tile_y);
glUniform1i(input_tile_x->location, input_image->tile_x);
},
output_image->texture_width,
output_image->texture_height);
if (!tiling_) {
os++; // for tiling, you always write to the same texture
}
is += input_image->tile_x * input_image->tile_y;
}
}
}
}
namespace caffe2 {
template <typename T>
class OpenGLConcatOp final : public Operator<CPUContext>, ImageAllocator<T> {
public:
OpenGLConcatOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws),
order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
}
bool RunOnDevice() override {
const GLImageVector<T>& input0 = Inputs()[0]->template Get<GLImageVector<T>>();
const int num_images = input0.size();
const GLImageVector<T>** input_images = new const GLImageVector<T>*[Inputs().size()];
input_images[0] = &input0;
int channelCount = input0.channels();
bool tiling = OperatorBase::GetSingleArgument<int>("tiling", 0);
// Only supports input channels divisible by 4 for now
CAFFE_ENFORCE_EQ(input0.channels() % 4, 0);
for (auto i = 1; i < Inputs().size(); i++) {
const GLImageVector<T>& inputi = Inputs()[i]->template Get<GLImageVector<T>>();
channelCount += inputi.channels();
CAFFE_ENFORCE_EQ(num_images, inputi.size());
CAFFE_ENFORCE_EQ(inputi.channels() % 4, 0);
CAFFE_ENFORCE_EQ(input0.width(), inputi.width());
CAFFE_ENFORCE_EQ(input0.height(), inputi.height());
input_images[i] = &inputi;
if (inputi.tile_x() > 1 || inputi.tile_y() > 1) {
tiling = true;
}
}
const int input_width = input0.width();
const int input_height = input0.height();
const int output_channels = channelCount;
const int output_width = input_width;
const int output_height = input_height;
int output_tile_x = 1;
int output_tile_y = 1;
if (tiling) {
computeOutputTiles(output_channels, output_tile_x, output_tile_y);
}
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
GLImageVector<T>* output = ImageAllocator<T>::newImage(
num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
if (!_concat) {
tile_descriptor output_tile_geometries{
{output_tile_x, output_tile_y}, {output_width, output_height}, output_tile_x * output_tile_y};
_concat.reset(new GLConcat(output_tile_geometries, tiling));
}
_concat->concat(input_images, *output, Inputs().size());
delete[] input_images;
Outputs()[0]->Reset(output);
return true;
}
private:
StorageOrder order_;
std::unique_ptr<GLConcat> _concat;
};
REGISTER_CPU_OPERATOR(OpenGLConcat, OpenGLConcatOp<float16_t>);
OPERATOR_SCHEMA(OpenGLConcat).NumInputs(2, 4).NumOutputs(1, 2);
} // namespace caffe2

File diff suppressed because it is too large Load Diff

View File

@ -1,232 +0,0 @@
#pragma once
#include "../core/GLFilter.h"
#include "../core/GLImage.h"
#include "gl_tiling_utils.h"
class GLConvolution : public GLFilter {
public:
static constexpr int MaxInputBatchSize = 8;
static constexpr int MaxOutputBatchSize = 4;
struct descriptor {
int input_channels;
int output_channels;
point kernel_size;
point input_tile_size;
point output_tile_size;
point input_tile_grid_size;
point output_tile_grid_size;
point input_padding;
point input_stride;
bool transposed;
};
const float* kernel;
const float* bias;
const float* prelu_scale;
binding* inputData[MaxInputBatchSize];
binding* previousData[MaxOutputBatchSize];
binding* outputSize;
binding* accumulate;
binding* fusePRelu;
binding* kernel_block[MaxInputBatchSize];
binding* bias_block;
binding* prelu_scale_block;
binding* inputTileRange;
const descriptor geometry;
const int prelu_scale_size;
const int input_batch_size;
const int output_batch_size;
const int input_tiles;
const int output_tiles;
const int input_tile_chunk_size;
const int output_tile_chunk_size;
const int input_tile_batch_size;
const int output_tile_batch_size;
const bool tiling;
static const char* fragment_shader;
GLConvolution(
const descriptor& _geometry,
const float* _kernel,
const float* _bias,
const float* _prelu_scale = nullptr,
int _prelu_scale_size = 0,
int _input_batch_size = 1,
int _output_batch_size = 1,
int _input_tiles = 1,
int _output_tiles = 1,
int _input_tile_chunk_size = 1,
int _output_tile_chunk_size = 1,
int _input_tile_batch_size = 1,
int _output_tile_batch_size = 1,
bool _tiling = false)
: GLFilter(
"GLConvolution",
vertex_shader,
fragment_shader,
input_bindings(_input_batch_size, _output_batch_size),
uniform_blocks_bindings(
_input_batch_size,
_output_batch_size,
_output_tile_batch_size,
_prelu_scale != nullptr),
{/* no attributes */},
{{"KERNEL_SIZE_X", c10::to_string(_geometry.kernel_size.x)},
{"KERNEL_SIZE_Y", c10::to_string(_geometry.kernel_size.y)},
{"INPUT_BATCH_SIZE", c10::to_string(_input_batch_size)},
{"OUTPUT_BATCH_SIZE", c10::to_string(_output_batch_size)},
{"INPUT_TILES", c10::to_string(_input_tiles)},
{"OUTPUT_TILES", c10::to_string(_output_tiles)},
{"INPUT_TILE_WIDTH", c10::to_string(_geometry.input_tile_size.x)},
{"INPUT_TILE_HEIGHT", c10::to_string(_geometry.input_tile_size.y)},
{"OUTPUT_TILE_WIDTH",
c10::to_string(_geometry.output_tile_size.x)},
{"OUTPUT_TILE_HEIGHT",
c10::to_string(_geometry.output_tile_size.y)},
{"INPUT_TILE_X", c10::to_string(_geometry.input_tile_grid_size.x)},
{"OUTPUT_TILE_X",
c10::to_string(_geometry.output_tile_grid_size.x)},
{"INPUT_TILE_CHUNK_SIZE", c10::to_string(_input_tile_chunk_size)},
{"OUTPUT_TILE_CHUNK_SIZE",
c10::to_string(_output_tile_chunk_size)},
{"OUTPUT_TILE_BATCH_SIZE",
c10::to_string(_output_tile_batch_size)},
{"TILED_CONVOLUTION", c10::to_string(_tiling)},
{"INPUT_PADDING_X",
c10::to_string(
_geometry.transposed
? _geometry.kernel_size.x - 1 - _geometry.input_padding.x
: _geometry.input_padding.x)},
{"INPUT_PADDING_Y",
c10::to_string(
_geometry.transposed
? _geometry.kernel_size.y - 1 - _geometry.input_padding.y
: _geometry.input_padding.y)},
{"INPUT_STRIDE_X", c10::to_string(_geometry.input_stride.x)},
{"INPUT_STRIDE_Y", c10::to_string(_geometry.input_stride.y)},
{"TRANSPOSED_CONVOLUTION", c10::to_string(_geometry.transposed)},
{"BOUNDS_CHECK_MODE",
c10::to_string(bounds_check_mode(_tiling, _geometry))}}),
kernel(_kernel),
bias(_bias),
prelu_scale(_prelu_scale),
geometry(_geometry),
prelu_scale_size(_prelu_scale_size),
input_batch_size(_input_batch_size),
output_batch_size(_output_batch_size),
input_tiles(_input_tiles),
output_tiles(_output_tiles),
input_tile_chunk_size(_input_tile_chunk_size),
output_tile_chunk_size(_output_tile_chunk_size),
input_tile_batch_size(_input_tile_batch_size),
output_tile_batch_size(_output_tile_batch_size),
tiling(_tiling) {}
~GLConvolution() {}
template <typename T>
void convolution(
const GLImageVector<T>& input_images,
const GLImageVector<T>& output_images);
private:
/*
* Computes BOUNDS_CHECK_MODE for the convolution parameters.
*
* @retval 0 if bounds check can be skipped
* @retval non-zero if bounds check can not be skipped
*/
inline static int bounds_check_mode(bool tiling, const descriptor& geometry) {
if (tiling) {
return 1;
}
int input_padding_x = geometry.input_padding.x,
input_padding_y = geometry.input_padding.y;
if (geometry.transposed) {
input_padding_x = geometry.kernel_size.x - 1 - input_padding_x;
input_padding_y = geometry.kernel_size.y - 1 - input_padding_y;
}
if (GLContext::getGLContext()->GL_EXT_texture_border_clamp_defined() ||
(input_padding_x == 0 && input_padding_y == 0)) {
return 0;
} else {
return 1;
}
}
const std::vector<binding*> input_bindings(
int input_batch_size,
int output_batch_size) {
std::vector<binding*> bindings({BINDING(outputSize),
BINDING(accumulate),
BINDING(fusePRelu),
BINDING(inputTileRange)});
for (int i = 0; i < input_batch_size; i++) {
bindings.push_back(
inputData[i] = new binding{"inputData[" + c10::to_string(i) + "]"});
}
for (int i = 0; i < output_batch_size; i++) {
bindings.push_back(
previousData[i] =
new binding{"previousData[" + c10::to_string(i) + "]"});
}
return bindings;
}
const std::vector<binding*> uniform_blocks_bindings(
int input_batch_size,
int output_batch_size,
int output_tile_batch_size,
bool fuse_prelu) {
std::vector<binding*> bindings({BINDING(bias_block)});
if (fuse_prelu) {
bindings.push_back(BINDING(prelu_scale_block));
}
for (int i = 0; i < std::max(input_batch_size, output_tile_batch_size);
i++) {
bindings.push_back(
kernel_block[i] =
new binding{"Kernel_block[" + c10::to_string(i) + "]"});
}
return bindings;
}
void pack_kernel_data_for_bached_conv(
float16_t* data,
size_t size,
int input_channels,
int output_channels,
int is,
int os,
int ib);
void pack_kernel_data_for_tiled_conv(
float16_t* data, // destination
size_t size,
int input_channels,
int output_channels,
point input_tile_range,
point output_tile_range);
template <typename T>
void run_batched_conv(
const GLImageVector<T>& input_images,
const GLImageVector<T>& output_images);
template <typename T>
void run_tiled_conv(
const GLImageVector<T>& input_images,
const GLImageVector<T>& output_images);
};

View File

@ -1,176 +0,0 @@
#include "caffe2/core/common.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include "../core/DataTransfer.h"
#include "../core/GLContext.h"
#include "../core/GLImage.h"
#include "../core/GLPlainTexture.h"
#include "../core/ImageAllocator.h"
#include <algorithm>
namespace caffe2 {
template <class T>
class CopyToOpenGLOp final : public Operator<CPUContext>, ImageAllocator<T> {
public:
CopyToOpenGLOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws) {}
bool RunOnDevice() override {
// caffe2::Timer timer;
const TensorCPU& X = Input(0);
const int num_images = X.dim32(0);
const int input_channels = X.dim32(1);
const int input_width = X.dim32(3);
const int input_height = X.dim32(2);
const int input_size = input_width * input_height;
// set up the OpenGL context
GLContext::getGLContext()->set_context();
const float* input = X.template data<float>();
int tile_x = GetSingleArgument<int>("tile_x", 1);
int tile_y = GetSingleArgument<int>("tile_y", 1);
GLImageVector<T>* output_image = ImageAllocator<T>::newImage(num_images,
input_width,
input_height,
input_channels,
tile_x,
tile_y,
#if CAFFE2_IOS
true
#else
false
#endif
);
if (output_image->tile_x() > 1 || output_image->tile_y() > 1) {
LOG(INFO) << "CopyToOpenGLOp tiling: " << output_image->tile_x() << ":"
<< output_image->tile_y();
}
Outputs()[0]->Reset(output_image);
for (int i = 0; i < num_images; i++) {
const auto textures = (*output_image)[i]->textures;
for (int slice = 0; slice < textures.size(); slice++) {
// timer.Start();
textures[slice]->map_load([&](void* buffer,
size_t width,
size_t height,
size_t stride,
size_t channels,
const GLTexture::Type& type) {
for (int y = 0; y < tile_y; y++) {
for (int x = 0; x < tile_x; x++) {
const int tiles = slice * tile_x * tile_y + y * tile_x + x;
const int slice_channels = std::min(4, input_channels - 4 * tiles);
interleaveSlice(
(float16_t*)buffer + 4 * (y * input_height * stride + x * input_width),
&input[i * input_channels * input_size + 4 * tiles * input_size],
input_width,
input_height,
stride, // texture stride
slice_channels);
}
}
});
// LOG(INFO) << "Texture uploading takes " << timer.MilliSeconds() << " ms";
}
}
return true;
}
};
REGISTER_CPU_OPERATOR(CopyToOpenGL, CopyToOpenGLOp<float16_t>);
OPERATOR_SCHEMA(CopyToOpenGL).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
template <class T>
class CopyFromOpenGLOp final : public Operator<CPUContext> {
public:
CopyFromOpenGLOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws) {}
bool RunOnDevice() override {
caffe2::Timer timer;
const GLImageVector<T>& X = Inputs()[0]->template Get<GLImageVector<T>>();
const int num_images = X.size();
const int input_channels = X.channels();
const int input_width = X.width();
const int input_height = X.height();
TensorCPU* Y = Output(0);
Y->Resize(num_images, input_channels, input_height, input_width);
const int output_width = input_width;
const int output_height = input_height;
const int output_size = input_width * input_height;
float* output = Y->mutable_data<float>();
const int tile_x = X.tile_x();
const int tile_y = X.tile_y();
for (int i = 0; i < num_images; i++) {
for (int slice = 0; slice < X[i]->slices; slice++) {
timer.Start();
const GLTexture* texture = X[i]->textures[slice];
texture->map_read([&](const void* buffer,
size_t width,
size_t height,
size_t stride,
size_t channels,
const GLTexture::Type& type) {
//#if CAFFE2_ANDROID && defined(__ARM_NEON__)
// if (static_cast<AndroidGLContext*>(GLContext::getGLContext())->get_platform() ==
// Mali) {
// caffe2::Timer timer;
// timer.Start();
// float16_t* copy_buffer = (float16_t*)malloc(_capacity);
// arm_memcpy(
// (volatile unsigned char*)copy_buffer, (volatile unsigned char*)buffer,
// _capacity);
// deInterleaveSlice(
// output + 4 * slice * output_size, copy_buffer, width, height, stride,
// slice_channels);
// free(copy_buffer);
// LOG(INFO) << "memcpy takes " << timer.MilliSeconds() << " ms";
// } else
//#endif
{
gl_log(GL_VERBOSE,
"calling deInterleaveSlice width: %d, height: %d, stride: %d, channels: %d\n",
width,
height,
stride,
channels);
for (int y = 0; y < tile_y; y++) {
for (int x = 0; x < tile_x; x++) {
const int tiles = slice * tile_x * tile_y + y * tile_x + x;
const int slice_channels = std::min(4, input_channels - 4 * tiles);
deInterleaveSlice(
output + i * input_channels * output_size + 4 * tiles * output_size,
(float16_t*)buffer + 4 * (y * input_height * stride + x * input_width),
input_width,
input_height,
stride,
slice_channels);
}
}
}
});
}
}
return true;
}
};
REGISTER_CPU_OPERATOR(CopyFromOpenGL, CopyFromOpenGLOp<float16_t>);
OPERATOR_SCHEMA(CopyFromOpenGL).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
} // namespace caffe2

View File

@ -1,462 +0,0 @@
#include "../core/GLFilter.h"
#include "../core/GLImage.h"
#include "../core/ImageAllocator.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include <iostream>
#include <vector>
class GLReduce : public GLFilter {
public:
binding* inputSize;
binding* outputSize;
binding* tileSize;
binding* inv_pixel_count;
binding* epsilon;
binding* inputData;
binding* averageData;
bool compute_inv_stdev;
bool compute_norm;
const std::vector<binding*> input_bindings(bool compute_norm_) {
std::vector<binding*> bindings({BINDING(inputSize),
BINDING(outputSize),
BINDING(tileSize),
BINDING(inv_pixel_count),
BINDING(epsilon),
BINDING(inputData)});
if (compute_norm_) {
bindings.push_back(BINDING(averageData));
}
return bindings;
}
GLReduce(bool compute_inv_stdev_ = false, bool compute_norm_ = false)
: GLFilter(
"GLReduce",
vertex_shader,
fragment_shader,
input_bindings(compute_norm_),
{/* no uniform_blocks_bindings */},
{/* no attributes */},
{{"COMPUTE_INV_STDEV", c10::to_string((int)compute_inv_stdev_)},
{"COMPUTE_NORM", c10::to_string((int)compute_norm_)}}),
compute_inv_stdev(compute_inv_stdev_),
compute_norm(compute_norm_) {}
template <typename T>
void reduce(const GLImage<T>* input_image,
const GLImage<T>* output_image,
int tile_size_x,
int tile_size_y,
float inv_pixel_count_ = 1.0,
float epsilon_ = 0.0);
template <typename T>
void norm(const GLImage<T>* input_image,
const GLImage<T>* avg_image,
const GLImage<T>* output_image,
int tile_size_x,
int tile_size_y,
float inv_pixel_count_);
static const char* fragment_shader;
};
// MARK: GLSL
const char* GLReduce::fragment_shader = R"GLSL(#version 300 es
#define COMPUTE_INV_STDEV $(COMPUTE_INV_STDEV)
#define COMPUTE_NORM $(COMPUTE_NORM)
precision mediump float;
precision mediump int;
in highp vec2 v_texCoord;
uniform ivec2 inputSize;
uniform ivec2 outputSize;
uniform ivec2 tileSize;
uniform float inv_pixel_count;
uniform float epsilon;
#if COMPUTE_NORM
TEXTURE_INPUT(averageData);
#endif
TEXTURE_INPUT(inputData);
TEXTURE_OUTPUT(0, outputData);
void main() {
ivec2 outputCoord = ivec2(v_texCoord * vec2(outputSize));
ivec2 texelCoord = outputCoord * tileSize;
ivec2 sumArea = min(tileSize, inputSize - texelCoord);
highp vec4 sum = vec4(0.0);
#if COMPUTE_NORM
vec4 avg = TEXTURE_LOAD(averageData, ivec2(0));
#endif
for (int y = 0; y < sumArea.y; y++) {
for (int x = 0; x < sumArea.x; x++) {
ivec2 idx = texelCoord + ivec2(x, y);
vec4 val = TEXTURE_LOAD(inputData, idx);
#if COMPUTE_NORM
val -= avg;
sum += val * val;
#else
sum += val;
#endif
}
}
#if COMPUTE_INV_STDEV
outputData = TEXTURE_STORE(inversesqrt(sum * vec4(inv_pixel_count) + vec4(epsilon)));
#elif COMPUTE_NORM
outputData = TEXTURE_STORE(sum * vec4(inv_pixel_count));
#else
outputData = TEXTURE_STORE(sum * vec4(inv_pixel_count) + vec4(epsilon));
#endif
}
)GLSL";
template <typename T>
void GLReduce::reduce(const GLImage<T>* input_image,
const GLImage<T>* output_image,
int tile_size_x,
int tile_size_y,
float inv_pixel_count_,
float epsilon_) {
int input_slices = input_image->slices;
int output_slices = output_image->slices;
for (int is = 0; is < input_slices; is++) {
std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
run(input_attachments,
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
[&]() {
glUniform2i(inputSize->location, input_image->width, input_image->height);
glUniform2i(outputSize->location, output_image->width, output_image->height);
glUniform2i(tileSize->location, tile_size_x, tile_size_y);
glUniform1f(inv_pixel_count->location, inv_pixel_count_);
glUniform1f(epsilon->location, epsilon_);
},
output_image->width,
output_image->height);
}
}
template <typename T>
void GLReduce::norm(const GLImage<T>* input_image,
const GLImage<T>* avg_image,
const GLImage<T>* output_image,
int tile_size_x,
int tile_size_y,
float inv_pixel_count_) {
int input_slices = input_image->slices;
int output_slices = output_image->slices;
for (int is = 0; is < input_slices; is++) {
std::vector<texture_attachment> input_attachments(
{{input_image->textures[is], inputData}, {avg_image->textures[is], averageData}});
run(input_attachments,
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
[&]() {
glUniform2i(inputSize->location, input_image->width, input_image->height);
glUniform2i(outputSize->location, output_image->width, output_image->height);
glUniform2i(tileSize->location, tile_size_x, tile_size_y);
glUniform1f(inv_pixel_count->location, inv_pixel_count_);
},
output_image->width,
output_image->height);
}
}
class GLScale : public GLFilter {
public:
binding* outputSize;
binding* inputData;
binding* averageData;
binding* normData;
binding* scale_factor;
binding* bias_factor;
binding* prelu_scale_factor;
const int channels;
const float* scale;
const float* bias;
const float* prelu_scale;
const int prelu_size;
const std::vector<binding*> input_bindings(bool fuse_prelu) {
std::vector<binding*> bindings({BINDING(outputSize),
BINDING(scale_factor),
BINDING(bias_factor),
BINDING(inputData),
BINDING(averageData),
BINDING(normData)});
if (fuse_prelu) {
bindings.push_back(prelu_scale_factor = new binding({"prelu_scale_factor"}));
}
return bindings;
}
GLScale(
const int _channels,
const float* _scale,
const float* _bias,
const float* _prelu_scale = nullptr,
const int _prelu_size = 0)
: GLFilter(
"GLScale",
vertex_shader,
fragment_shader,
input_bindings(_prelu_scale != nullptr),
{/* no uniform blocks */},
{/* no attributes */},
{{"FUSE_PRELU", c10::to_string(_prelu_scale != nullptr)}}),
channels(_channels),
scale(_scale),
bias(_bias),
prelu_scale(_prelu_scale),
prelu_size(_prelu_size) {}
template <typename T>
void scale_and_shift(const GLImage<T>* input_image,
const GLImage<T>* avg_image,
const GLImage<T>* norm_image,
const GLImage<T>* output_image);
static const char* fragment_shader;
};
// MARK: GLSL
const char* GLScale::fragment_shader = R"GLSL(#version 300 es
#define FUSE_PRELU $(FUSE_PRELU)
precision mediump float;
precision mediump int;
in highp vec2 v_texCoord;
uniform ivec2 outputSize;
uniform vec4 scale_factor;
uniform vec4 bias_factor;
#if FUSE_PRELU
uniform vec4 prelu_scale_factor;
#endif
TEXTURE_INPUT(inputData);
TEXTURE_INPUT(averageData);
TEXTURE_INPUT(normData);
TEXTURE_OUTPUT(0, outputData);
void main() {
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
vec4 val = TEXTURE_LOAD(inputData, texelCoord);
vec4 avg = TEXTURE_LOAD(averageData, ivec2(0));
vec4 inv_stdev = TEXTURE_LOAD(normData, ivec2(0));
#if FUSE_PRELU
vec4 result = (val - avg) * inv_stdev * scale_factor + bias_factor;
vec4 o = mix(result * prelu_scale_factor, result, vec4(greaterThan(result, vec4(0))));
outputData = TEXTURE_STORE(o);
#else
vec4 o = (val - avg) * inv_stdev * scale_factor + bias_factor;
outputData = TEXTURE_STORE(o);
#endif
}
)GLSL";
template <typename T>
void GLScale::scale_and_shift(const GLImage<T>* input_image,
const GLImage<T>* avg_image,
const GLImage<T>* norm_image,
const GLImage<T>* output_image) {
int input_slices = input_image->slices;
int output_slices = output_image->slices;
for (int is = 0; is < input_slices; is++) {
std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData},
{avg_image->textures[is], averageData},
{norm_image->textures[is], normData}});
run(input_attachments,
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
[&]() {
glUniform2i(outputSize->location, output_image->width, output_image->height);
glUniform4f(scale_factor->location,
scale[4 * is],
channels > 4 * is + 1 ? scale[4 * is + 1] : 0,
channels > 4 * is + 2 ? scale[4 * is + 2] : 0,
channels > 4 * is + 3 ? scale[4 * is + 3] : 0);
glUniform4f(bias_factor->location,
bias[4 * is],
channels > 4 * is + 1 ? bias[4 * is + 1] : 0,
channels > 4 * is + 2 ? bias[4 * is + 2] : 0,
channels > 4 * is + 3 ? bias[4 * is + 3] : 0);
if (prelu_scale != nullptr) {
glUniform4f(prelu_scale_factor->location,
prelu_size == channels ? prelu_scale[4 * is] : prelu_scale[0],
channels > 4 * is + 1 && prelu_size == channels ? prelu_scale[4 * is + 1]
: prelu_scale[0],
channels > 4 * is + 2 && prelu_size == channels ? prelu_scale[4 * is + 2]
: prelu_scale[0],
channels > 4 * is + 3 && prelu_size == channels ? prelu_scale[4 * is + 3]
: prelu_scale[0]);
}
},
output_image->width,
output_image->height);
}
}
namespace caffe2 {
template <class T, bool FUSE_PRELU>
class OpenGLInstanceNormPReluOp final : public Operator<CPUContext>, ImageAllocator<T> {
public:
OpenGLInstanceNormPReluOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws),
epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
CAFFE_ENFORCE(epsilon_ >= 0, "Must pass a nonnegative epsilon.");
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
}
bool RunOnDevice() override {
const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
const int num_images = input.size();
const int input_channels = input.channels();
const int input_width = input.width();
const int input_height = input.height();
const int output_channels = input_channels;
const int output_width = input_width;
const int output_height = input_height;
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
const int tile_size_x = 16;
const int tile_size_y = 16;
int avg_buf_width = input_width;
int avg_buf_height = input_height;
vector<GLImageVector<T>*> reduce_buf;
while (reduce_buf.size() == 0 ||
(avg_buf_width > tile_size_x && avg_buf_height > tile_size_y)) {
avg_buf_width = (avg_buf_width + tile_size_x - 1) / tile_size_x;
avg_buf_height = (avg_buf_height + tile_size_y - 1) / tile_size_y;
reduce_buf.push_back(
ImageAllocator<T>::newImage(1, avg_buf_width, avg_buf_height, output_channels));
}
GLImageVector<T>* avg = ImageAllocator<T>::newImage(num_images, 1, 1, output_channels);
GLImageVector<T>* inv_stdev = ImageAllocator<T>::newImage(num_images, 1, 1, output_channels);
GLImageVector<T>* output = ImageAllocator<T>::newImage(
num_images, output_width, output_height, output_channels, is_last);
const float* prelu_data = nullptr;
int prelu_size = 0;
if (FUSE_PRELU) {
DCHECK_EQ(InputSize(), 4);
const auto& prelu_scale = Input(PRELU);
prelu_data = prelu_scale.template data<float>();
prelu_size = prelu_scale.size();
} else {
DCHECK_EQ(InputSize(), 3);
}
const auto& scale = Input(SCALE);
const auto& bias = Input(BIAS);
if (!f_reduce) {
f_reduce.reset(new GLReduce());
f_norm.reset(new GLReduce(false, true));
f_stdDev.reset(new GLReduce(true, false));
f_scale.reset(new GLScale(input_channels,
scale.template data<float>(),
bias.template data<float>(),
prelu_data,
prelu_size));
}
for (int i = 0; i < num_images; i++) {
for (int k = 0; k < reduce_buf.size() + 1; k++) {
const GLImage<T>* in = k == 0 ? input[i] : (*reduce_buf[k - 1])[0];
GLImage<T>* out = k == reduce_buf.size() ? (*avg)[i] : (*reduce_buf[k])[0];
float norm = k < reduce_buf.size()
? 1.0 / (tile_size_x * tile_size_y)
: (float)pow(tile_size_x * tile_size_y, reduce_buf.size()) /
(float)(input_width * input_height);
const int running_tile_size_x = k < reduce_buf.size() ? tile_size_x : in->width;
const int running_tile_size_y = k < reduce_buf.size() ? tile_size_y : in->height;
f_reduce->reduce(in, out, running_tile_size_x, running_tile_size_y, norm);
}
for (int k = 0; k < reduce_buf.size() + 1; k++) {
const GLImage<T>* in = k == 0 ? input[i] : (*reduce_buf[k - 1])[0];
GLImage<T>* out = k == reduce_buf.size() ? (*inv_stdev)[i] : (*reduce_buf[k])[0];
float norm = k < reduce_buf.size()
? 1.0 / (tile_size_x * tile_size_y)
: (float)pow(tile_size_x * tile_size_y, reduce_buf.size()) /
(float)(input_width * input_height);
if (k == 0) {
f_norm->norm(in, (*avg)[i], out, tile_size_x, tile_size_y, norm);
} else if (k < reduce_buf.size()) {
f_reduce->reduce(in, out, tile_size_x, tile_size_y, norm);
} else {
const int running_tile_size_x = k < reduce_buf.size() ? tile_size_x : in->width;
const int running_tile_size_y = k < reduce_buf.size() ? tile_size_y : in->height;
f_stdDev->reduce(in, out, running_tile_size_x, running_tile_size_y, norm, epsilon_);
}
}
f_scale->scale_and_shift(input[i], (*avg)[i], (*inv_stdev)[i], (*output)[i]);
}
Outputs()[OUTPUT]->Reset(output);
if (OutputSize() > 1) {
Outputs()[MEAN]->Reset(avg);
Outputs()[INV_STDEV]->Reset(inv_stdev);
} else {
delete avg;
delete inv_stdev;
}
for (auto&& rb : reduce_buf) {
delete rb;
}
return true;
}
private:
float epsilon_;
StorageOrder order_;
std::unique_ptr<GLReduce> f_reduce;
std::unique_ptr<GLReduce> f_norm;
std::unique_ptr<GLReduce> f_stdDev;
std::unique_ptr<GLScale> f_scale;
INPUT_TAGS(INPUT, SCALE, BIAS, PRELU);
OUTPUT_TAGS(OUTPUT, MEAN, INV_STDEV);
};
REGISTER_CPU_OPERATOR(OpenGLInstanceNorm, OpenGLInstanceNormPReluOp<float16_t, false>);
OPERATOR_SCHEMA(OpenGLInstanceNorm).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
REGISTER_CPU_OPERATOR(OpenGLInstanceNormPRelu, OpenGLInstanceNormPReluOp<float16_t, true>);
OPERATOR_SCHEMA(OpenGLInstanceNormPRelu).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
} // namespace caffe2

View File

@ -1,120 +0,0 @@
#include "../core/GLFilter.h"
#include "../core/GLImage.h"
#include "../core/ImageAllocator.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
class GLMul : public GLFilter {
public:
binding* outputSize;
binding* inputData;
binding* B;
GLMul()
: GLFilter("GLMul",
vertex_shader,
fragment_shader,
std::vector<binding*>({BINDING(outputSize), BINDING(inputData), BINDING(B)}),
{/* no uniform blocks */},
{/* no attributes */},
{/* no replacements */}) {}
template <typename T>
void mul(const GLImageVector<T>& input_images, const GLImageVector<T>& output_images, float b);
static const char* fragment_shader;
};
// MARK: GLSL
const char* GLMul::fragment_shader = R"GLSL(#version 300 es
precision mediump float;
precision mediump int;
in highp vec2 v_texCoord;
uniform ivec2 outputSize;
uniform vec4 B;
TEXTURE_INPUT(inputData);
TEXTURE_OUTPUT(0, outputData);
void main() {
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
vec4 A = TEXTURE_LOAD(inputData, texelCoord);
outputData = TEXTURE_STORE(A * B);
}
)GLSL";
template <typename T>
void GLMul::mul(const GLImageVector<T>& input_images,
const GLImageVector<T>& output_images,
float b) {
for (int i = 0; i < input_images.size(); i++) {
auto input_image = input_images[i];
auto output_image = output_images[i];
int input_slices = input_image->slices;
int output_slices = output_image->slices;
for (int is = 0; is < input_slices; is++) {
run(std::vector<texture_attachment>({{input_image->textures[is], inputData}}),
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
[&]() {
glUniform2i(outputSize->location, output_image->width, output_image->height);
glUniform4f(B->location, b, b, b, b);
},
output_image->width,
output_image->height);
}
}
}
namespace caffe2 {
template <class T>
class OpenGLMulOp final : public Operator<CPUContext>, ImageAllocator<T> {
public:
OpenGLMulOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws) {
OPERATOR_NEEDS_FEATURE(OperatorBase::GetSingleArgument<int>("broadcast", 0) == 1,
"OpenGLMul only supports broadcast");
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false,
"OpenGLMul does not support axis");
}
bool RunOnDevice() override {
const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
const auto& B = Input(1);
CAFFE_ENFORCE_EQ(B.size(), 1); // only scalar is supported
const int num_images = input.size();
const auto output_height = input.height();
const auto output_width = input.width();
const int output_channels = input.channels();
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
GLImageVector<T>* output = ImageAllocator<T>::newImage(
num_images, output_width, output_height, output_channels, is_last);
if (!_mult) {
_mult.reset(new GLMul());
}
_mult->mul(input, *output, B.template data<float>()[0]);
Outputs()[0]->Reset(output);
return true;
}
private:
std::unique_ptr<GLMul> _mult;
};
REGISTER_CPU_OPERATOR(OpenGLMul, OpenGLMulOp<float16_t>);
} // namespace caffe2

View File

@ -1,142 +0,0 @@
#include "../core/GLFilter.h"
#include "../core/GLImage.h"
#include "../core/ImageAllocator.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include <iostream>
#include <vector>
class GLNormPlanarYUV : public GLFilter {
public:
const float* mean;
const float* std;
binding* inputData;
binding* outputSize;
binding* mean_data;
binding* std_data;
GLNormPlanarYUV(const float* _mean, const float* _std)
: GLFilter("GLNormPlanarYUV",
vertex_shader,
fragment_shader,
std::vector<binding*>({BINDING(inputData),
BINDING(outputSize),
BINDING(mean_data),
BINDING(std_data)}), // input bindings
{/* no uniform blocks */},
{/* no attributes */},
{}),
mean(_mean),
std(_std) {}
template <typename T>
void normalize(const GLImageVector<T>& input_images, const GLImageVector<T>& output_images);
static const char* fragment_shader;
};
// MARK: GLSL
const char* GLNormPlanarYUV::fragment_shader = R"GLSL(#version 300 es
precision mediump float;
precision mediump int;
in highp vec2 v_texCoord;
uniform ivec2 outputSize;
uniform vec4 mean_data;
uniform vec4 std_data;
TEXTURE_INPUT(inputData);
TEXTURE_OUTPUT(0, outputData);
void main() {
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
outputData = TEXTURE_STORE((value - mean_data) / std_data);
}
)GLSL";
template <class T>
void GLNormPlanarYUV::normalize(const GLImageVector<T>& input_images,
const GLImageVector<T>& output_images) {
int num_images = input_images.size();
for (int i = 0; i < num_images; i++) {
GLImage<T>* input_image = input_images[i];
GLImage<T>* output_image = output_images[i];
int input_slices = input_image->slices;
int output_slices = output_image->slices;
for (int is = 0; is < input_slices; is++) {
std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
run(input_attachments,
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
[&]() {
glUniform2i(outputSize->location, output_image->width, output_image->height);
glUniform4f(mean_data->location, mean[0], mean[1], mean[2], 0.0);
glUniform4f(std_data->location, std[0], std[1], std[2], 1.0);
},
output_image->width,
output_image->height);
}
}
}
namespace caffe2 {
template <typename T>
class GLNormPlanarYUVOp final : public Operator<CPUContext>, ImageAllocator<T> {
public:
GLNormPlanarYUVOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws),
order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
}
bool RunOnDevice() override {
const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
const int num_images = input.size();
const int input_channels = input.channels();
const int input_width = input.width();
const int input_height = input.height();
const int output_channels = input_channels;
const int output_width = input_width;
const int output_height = input_height;
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
GLImageVector<T>* output = ImageAllocator<T>::newImage(
num_images, output_width, output_height, output_channels, is_last);
const auto& M = Input(1); // mean
const auto& S = Input(2); // standard deviation
CAFFE_ENFORCE(input_channels == M.dim(1));
CAFFE_ENFORCE(input_channels == S.dim(1));
if (!_normPlanarYUV) {
_normPlanarYUV.reset(new GLNormPlanarYUV(M.template data<float>(), S.template data<float>()));
}
_normPlanarYUV->normalize(input, *output);
Outputs()[0]->Reset(output);
return true;
}
private:
StorageOrder order_;
std::unique_ptr<GLNormPlanarYUV> _normPlanarYUV;
};
REGISTER_CPU_OPERATOR(OpenGLNormalizePlanarYUV, GLNormPlanarYUVOp<float16_t>);
OPERATOR_SCHEMA(OpenGLNormalizePlanarYUV).NumInputs(3).NumOutputs(1);
} // namespace caffe2

View File

@ -1,273 +0,0 @@
#include "../core/GLFilter.h"
#include "../core/GLImage.h"
#include "../core/ImageAllocator.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include <iostream>
#include <vector>
class GLPRelu : public GLFilter {
public:
typedef enum { PRelu = 0, Relu = 1 } ReluType;
const float* scale;
binding* inputData;
binding* scale_block;
const int scale_size;
const int channels;
const int output_tile_x;
const int output_tile_y;
const int output_tile_width;
const int output_tile_height;
GLPRelu(
const float* _scale,
const int _scale_size,
const int _channels,
int _output_tile_x,
int _output_tile_y,
int _output_tile_width,
int _output_tile_height)
: GLFilter(
"GLPRelu",
vertex_shader,
fragment_shader,
std::vector<binding*>({BINDING(inputData)}),
std::vector<binding*>({BINDING(scale_block)}),
{/* no attributes */},
{{"USE_RELU", c10::to_string(PRelu)},
{"OUTPUT_TILES", c10::to_string(_output_tile_x * _output_tile_y)},
{"OUTPUT_TILE_X", c10::to_string(_output_tile_x)},
{"OUTPUT_TILE_WIDTH", c10::to_string(_output_tile_width)},
{"OUTPUT_TILE_HEIGHT", c10::to_string(_output_tile_height)},
{"TILED_PRELU",
c10::to_string(_output_tile_x > 1 || _output_tile_y > 1)}}),
scale(_scale),
scale_size(_scale_size),
channels(_channels),
output_tile_x(_output_tile_x),
output_tile_y(_output_tile_y),
output_tile_width(_output_tile_width),
output_tile_height(_output_tile_height) {}
GLPRelu(const int _channels)
: GLFilter(
"GLRelu",
vertex_shader,
fragment_shader,
std::vector<binding*>({BINDING(inputData)}),
{/* no uniform blocks */},
{/* no attributes */},
{{"USE_RELU", c10::to_string(Relu)},
{"OUTPUT_TILES", c10::to_string(1)},
{"OUTPUT_TILE_X", c10::to_string(1)},
{"OUTPUT_TILE_WIDTH", c10::to_string(1)},
{"OUTPUT_TILE_HEIGHT", c10::to_string(1)},
{"TILED_PRELU", c10::to_string(0)}}),
scale(nullptr),
scale_block(nullptr),
scale_size(0),
channels(_channels),
output_tile_x(1),
output_tile_y(1),
output_tile_width(1),
output_tile_height(1) {}
template <typename T>
void prelu(const GLImageVector<T>& input_images,
const GLImageVector<T>& output_images,
GLPRelu::ReluType reluType);
static const char* fragment_shader;
};
// MARK: GLSL
const char* GLPRelu::fragment_shader = R"GLSL(#version 300 es
#define TILED_PRELU $(TILED_PRELU)
#define USE_RELU $(USE_RELU)
// tiling
#define OUTPUT_TILES $(OUTPUT_TILES)
#define OUTPUT_TILE_X $(OUTPUT_TILE_X)
#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
// common
precision mediump float;
precision highp int;
TEXTURE_INPUT(inputData);
TEXTURE_OUTPUT(0, outputData);
in highp vec2 v_texCoord;
#if USE_RELU
// Relu
void main() {
ivec2 inputSize = textureSize(inputData, 0);
ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
outputData = TEXTURE_STORE(max(value, vec4(0.0)));
}
#else
#if TILED_PRELU
const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
layout (std140) uniform scale_block {
highp uvec4 scale[(OUTPUT_TILES + 1) / 2];
};
void main() {
ivec2 inputSize = textureSize(inputData, 0);
ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
// outputData = value > 0 ? value : value * weight;
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
vec4 preluValue = (tileNum % 2 == 0) ? unpackHalf4x16(scale[tileNum/2].xy) : unpackHalf4x16(scale[tileNum/2].zw);
value = mix(value * preluValue, value, vec4(greaterThan(value, vec4(0))));
outputData = TEXTURE_STORE(value);
}
#else
layout (std140) uniform scale_block {
highp uvec4 scale;
};
void main() {
ivec2 inputSize = textureSize(inputData, 0);
ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
// outputData = value > 0 ? value : value * weight;
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
value = mix(value * unpackHalf4x16(scale.xy), value, vec4(greaterThan(value, vec4(0))));
outputData = TEXTURE_STORE(value);
}
#endif // TILED_PRELU
#endif // USE_RELU
)GLSL";
template <typename T>
void GLPRelu::prelu(const GLImageVector<T>& input_images,
const GLImageVector<T>& output_images,
GLPRelu::ReluType reluType) {
int num_images = input_images.size();
for (int i = 0; i < num_images; i++) {
GLImage<T>* input_image = input_images[i];
GLImage<T>* output_image = output_images[i];
int input_slices = input_image->slices;
int output_slices = output_image->slices;
for (int is = 0; is < input_slices; is++) {
if (reluType == PRelu) {
attach_uniform_buffer<float16_t>(scale_block, 0, [&](float16_t* data, size_t size) {
int output_tiles = output_tile_x * output_tile_y;
for (int j = 0, k = 4 * is * output_tiles;
k < std::min(channels, 4 * (is + 1) * output_tiles);
j++, k++) {
data[j] = scale_size == channels ? scale[k] : scale[0];
}
});
}
std::vector<texture_attachment> input_attachments;
input_attachments.push_back({input_image->textures[is], inputData});
run(input_attachments,
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
[&]() {},
output_image->texture_width,
output_image->texture_height);
}
}
}
namespace caffe2 {
template <typename T, GLPRelu::ReluType reluType>
class OpenGLPReluOp final : public Operator<CPUContext>, ImageAllocator<T> {
public:
OpenGLPReluOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws),
order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
}
bool RunOnDevice() override {
const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
const int num_images = input.size();
const int input_channels = input.channels();
const int input_width = input.width();
const int input_height = input.height();
const int output_channels = input_channels;
const int output_width = input_width;
const int output_height = input_height;
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
if (input_tile_x > 1 || input_tile_y > 1) {
CAFFE_ENFORCE_EQ(input.slices(), 1, "Input needs to be tiled in a single texture");
}
GLImageVector<T>* output = ImageAllocator<T>::newImage(num_images,
output_width,
output_height,
output_channels,
output_tile_x,
output_tile_y,
is_last);
const auto* scale = reluType == GLPRelu::PRelu ? &Input(1) : nullptr;
if (!_prelu) {
if (reluType == GLPRelu::PRelu) {
_prelu.reset(new GLPRelu(scale->template data<float>(),
scale->size(),
input_channels,
output_tile_x,
output_tile_y,
output_width,
output_height));
} else {
_prelu.reset(new GLPRelu(input_channels));
}
}
_prelu->prelu(input, *output, reluType);
Outputs()[0]->Reset(output);
return true;
}
private:
StorageOrder order_;
std::unique_ptr<GLPRelu> _prelu;
};
REGISTER_CPU_OPERATOR(OpenGLPRelu, OpenGLPReluOp<float16_t, GLPRelu::PRelu>);
OPERATOR_SCHEMA(OpenGLPRelu)
.NumInputs(2)
.NumOutputs(1)
.AllowInplace({{0, 0}})
.IdenticalTypeAndShape();
REGISTER_CPU_OPERATOR(OpenGLRelu, OpenGLPReluOp<float16_t, GLPRelu::Relu>);
OPERATOR_SCHEMA(OpenGLRelu)
.NumInputs(1)
.NumOutputs(1)
.AllowInplace({{0, 0}})
.IdenticalTypeAndShape();
} // namespace caffe2

View File

@ -1,159 +0,0 @@
#include "../core/GLFilter.h"
#include "../core/GLImage.h"
#include "../core/ImageAllocator.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include "caffe2/operators/conv_pool_op_base.h"
class GLPadImage : public GLFilter {
public:
binding* padSize;
binding* inputSize;
binding* outputSize;
binding* inputData;
GLPadImage()
: GLFilter(
"GLPadImage",
vertex_shader,
fragment_shader,
std::vector<binding*>(
{BINDING(padSize), BINDING(inputSize), BINDING(outputSize), BINDING(inputData)}),
{/* no uniform blocks */},
{/* no attributes */},
{/* no replacements */}) {}
template <typename T>
void pad(const GLImageVector<T>& input_images,
const GLImageVector<T>& output_images,
const int pad_l,
const int pad_t);
static const char* fragment_shader;
};
// MARK: GLSL
const char* GLPadImage::fragment_shader = R"GLSL(#version 300 es
precision mediump float;
precision mediump int;
in highp vec2 v_texCoord;
uniform ivec2 padSize;
uniform ivec2 inputSize;
uniform ivec2 outputSize;
TEXTURE_INPUT(inputData);
TEXTURE_OUTPUT(0, outputData);
void main() {
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize)) - padSize;
texelCoord = max(texelCoord, -texelCoord);
texelCoord = min(texelCoord, ivec2(2) * (inputSize - 1) - texelCoord);
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
outputData = TEXTURE_STORE(value);
}
)GLSL";
template <typename T>
void GLPadImage::pad(const GLImageVector<T>& input_images,
const GLImageVector<T>& output_images,
const int pad_l,
const int pad_t) {
for (int i = 0; i < input_images.size(); i++) {
auto input_image = input_images[i];
auto output_image = output_images[i];
int input_slices = input_image->slices;
int output_slices = output_image->slices;
for (int is = 0; is < input_slices; is++) {
run(std::vector<texture_attachment>({{input_image->textures[is], inputData}}),
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
[&]() {
glUniform2i(inputSize->location, input_image->width, input_image->height);
glUniform2i(outputSize->location, output_image->width, output_image->height);
glUniform2i(padSize->location, pad_l, pad_t);
},
output_image->width,
output_image->height);
}
}
}
namespace caffe2 {
template <typename OPBase>
static void computeOutputHW(OPBase* op, int H, int W, int* OH, int* OW) {
Tensor<CPUContext> input, output;
input.Resize(1, 1, H, W);
op->SetOutputSize(input, &output, 1);
CAFFE_ENFORCE_EQ(output.ndim(), 4);
*OH = output.dim(2);
*OW = output.dim(3);
}
template <class T>
class OpenGLPadImageOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator<T> {
public:
OpenGLPadImageOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<CPUContext>(operator_def, ws),
mode_(OperatorBase::GetSingleArgument<string>("mode", "")) {
OPERATOR_NEEDS_FEATURE(order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
OPERATOR_NEEDS_FEATURE(mode_ == "reflect", "OpenGL only supports reflection");
CAFFE_ENFORCE(legacy_pad_ == LegacyPadding::NOTSET,
"Padding layer only supports explicit pad values.");
CAFFE_ENFORCE(dilation_h() == 1 && dilation_w() == 1,
"Pooling op does not support dilation right now.");
CAFFE_ENFORCE(stride_h() == 1 && stride_w() == 1,
"Pooling op does not support stride right now.");
// Pad op does not use kernel sizes, so we set it to 1 for computing the
// output size.
kernel_.assign(pads_.size() / 2, 1);
}
bool RunOnDeviceWithOrderNCHW() override {
const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
const int num_images = input.size();
const int input_width = input.width();
const int input_height = input.height();
const int input_channels = input.channels();
const int output_channels = input_channels;
int output_height, output_width;
computeOutputHW(this, input_height, input_width, &output_height, &output_width);
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
GLImageVector<T>* output = ImageAllocator<T>::newImage(
num_images, output_width, output_height, output_channels, is_last);
if (!padImage_) {
padImage_.reset(new GLPadImage());
LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => "
<< output_channels << ": " << output_height << " X " << output_width;
LOG(INFO) << "Padmode: " << mode_ << ", pad_l = " << pad_l() << ", pad_r = " << pad_r() << ", pad_t = " << pad_t()
<< ", pad_b = " << pad_b();
}
padImage_->pad(input, *output, pad_l(), pad_t());
Outputs()[0]->Reset(output);
return true;
}
private:
std::string mode_;
std::unique_ptr<GLPadImage> padImage_;
};
REGISTER_CPU_OPERATOR(OpenGLPadImage, OpenGLPadImageOp<float16_t>);
OPERATOR_SCHEMA(OpenGLPadImage).NumInputs(1).NumOutputs(1);
} // namespace caffe2

View File

@ -1,339 +0,0 @@
#include "../core/GLFilter.h"
#include "../core/GLImage.h"
#include "../core/ImageAllocator.h"
#include "caffe2/core/timer.h"
#include "caffe2/operators/pool_op.h"
class GLPool : public GLFilter {
public:
typedef enum { AveragePool, MaxPool } PoolType;
struct point {
int x;
int y;
};
struct descriptor {
int channels;
point kernel_size;
point input_padding;
point input_stride;
point input_tile_size;
point output_tile_size;
};
binding* inputData;
binding* kernelSize;
binding* outputSize;
const descriptor geometry;
GLPool(const descriptor& _geometry, PoolType poolType, bool _tiling)
: GLFilter(
"GLPool",
vertex_shader,
fragment_shader,
{
BINDING(inputData),
BINDING(kernelSize),
BINDING(outputSize),
},
{/* no uniform blocks */},
{/* no attributes */},
{{"KERNEL_SIZE_X", c10::to_string(_geometry.kernel_size.x)},
{"KERNEL_SIZE_Y", c10::to_string(_geometry.kernel_size.y)},
{"INPUT_PADDING_X", c10::to_string(_geometry.input_padding.x)},
{"INPUT_PADDING_Y", c10::to_string(_geometry.input_padding.y)},
{"INPUT_STRIDE_X", c10::to_string(_geometry.input_stride.x)},
{"INPUT_STRIDE_Y", c10::to_string(_geometry.input_stride.y)},
{"INPUT_TILE_WIDTH", c10::to_string(_geometry.input_tile_size.x)},
{"INPUT_TILE_HEIGHT", c10::to_string(_geometry.input_tile_size.y)},
{"OUTPUT_TILE_WIDTH",
c10::to_string(_geometry.output_tile_size.x)},
{"OUTPUT_TILE_HEIGHT",
c10::to_string(_geometry.output_tile_size.y)},
{"TILED_POOLING", c10::to_string(_tiling)},
{"MAX_POOL", c10::to_string(poolType == MaxPool)},
{"BOUNDS_CHECK_MODE", c10::to_string(1)}}),
geometry(_geometry) {}
~GLPool() {}
void pool(const GLImageVector<float16_t>& input_images,
const GLImageVector<float16_t>& output_images) {
for (int i = 0; i < input_images.size(); i++) {
auto input_image = input_images[i];
auto output_image = output_images[i];
int input_slices = input_image->slices;
int output_slices = output_image->slices;
for (int is = 0; is < input_slices; is++) {
run({{input_image->textures[is], inputData}},
{output_image->textures[is]},
[&]() {
glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
glUniform2i(kernelSize->location, geometry.kernel_size.x, geometry.kernel_size.y);
},
output_image->texture_width,
output_image->texture_height);
}
}
}
private:
/*
* Computes BOUNDS_CHECK_MODE for the convolution parameters.
*
* @retval 0 if bounds check can be skipped
* @retval non-zero if bounds check can not be skipped
*/
inline static int bounds_check_mode(bool tiling, const descriptor& geometry) {
if (tiling) {
return 1;
}
if (GLContext::getGLContext()->GL_EXT_texture_border_clamp_defined() ||
(geometry.input_padding.x == 0 && geometry.input_padding.y == 0)) {
return 0;
} else {
return 1;
}
}
static const char* fragment_shader;
};
// MARK: GLSL
const char* GLPool::fragment_shader = R"GLSL(#version 300 es
#define TILED_POOLING $(TILED_POOLING)
#define MAX_POOL $(MAX_POOL)
// tiling
#define INPUT_TILE_WIDTH $(INPUT_TILE_WIDTH)
#define INPUT_TILE_HEIGHT $(INPUT_TILE_HEIGHT)
#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
#define BOUNDS_CHECK_MODE $(BOUNDS_CHECK_MODE)
precision mediump float;
precision mediump int;
in highp vec2 v_texCoord;
const ivec2 input_padding = ivec2($(INPUT_PADDING_X), $(INPUT_PADDING_Y));
const ivec2 input_stride = ivec2($(INPUT_STRIDE_X), $(INPUT_STRIDE_Y));
const ivec2 kernel_size = ivec2($(KERNEL_SIZE_X), $(KERNEL_SIZE_Y));
uniform ivec2 kernelSize;
uniform ivec2 outputSize;
TEXTURE_INPUT(inputData);
TEXTURE_OUTPUT(0, outputData);
#if BOUNDS_CHECK_MODE == 0
#define IN_BOUNDS(p, p0, p1) (true)
#else
#define IN_BOUNDS(p, p0, p1) (all(greaterThanEqual(p, p0)) && all(lessThan(p, p1)))
#endif
// MIN_FLOAT is -2^14, which is the minimum precision requirement for mediump in OpenGL ES 3.0
const float MIN_FLOAT = -exp2(14.0);
#if TILED_POOLING
const ivec2 inputTileSize = ivec2(INPUT_TILE_WIDTH, INPUT_TILE_HEIGHT);
const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
// tiled pooling
#if MAX_POOL
#define POOL { \
pool = vec4(MIN_FLOAT); \
for (int y = 0; y < kernelSize.y; y++) { \
for (int x = 0; x < kernelSize.x; x++) { \
ivec2 idx = tileCoord + ivec2(x, y); \
if IN_BOUNDS(idx, ivec2(0), inputTileSize) { \
vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
pool = max(pool, data); \
} \
} \
} \
}
#else
#define POOL { \
int count = 0; \
for (int y = 0; y < kernelSize.y; y++) { \
for (int x = 0; x < kernelSize.x; x++) { \
ivec2 idx = tileCoord + ivec2(x, y); \
if IN_BOUNDS(idx, ivec2(0), inputTileSize) { \
vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
pool += data;\
count += 1; \
} \
} \
} \
pool = pool / float(count); \
}
#endif // MAX_POOL
void main() {
ivec2 inputSize = textureSize(inputData, 0);
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
tileCoord = input_stride * tileCoord - input_padding;
ivec2 inputTileOffset = tile * inputTileSize;
#if MAX_POOL
vec4 pool = vec4(0);
#else
highp vec4 pool = vec4(0);
#endif
POOL;
outputData = TEXTURE_STORE(pool);
}
#else
// no tiling
#if MAX_POOL
#define POOL { \
pool = vec4(MIN_FLOAT); \
for (int y = 0; y < kernelSize.y; y++) { \
for (int x = 0; x < kernelSize.x; x++) { \
ivec2 idx = texelCoord + ivec2(x, y); \
if IN_BOUNDS(idx, ivec2(0), inputSize) { \
vec4 data = TEXTURE_LOAD(inputData, idx); \
pool = max(pool, data); \
} \
} \
} \
}
#else
#define POOL { \
int count = 0; \
for (int y = 0; y < kernelSize.y; y++) { \
for (int x = 0; x < kernelSize.x; x++) { \
ivec2 idx = texelCoord + ivec2(x, y); \
if IN_BOUNDS(idx, ivec2(0), inputSize) { \
vec4 data = TEXTURE_LOAD(inputData, idx); \
pool += data; \
count += 1; \
} \
} \
} \
pool = pool / float(count); \
}
#endif // MAX_POOL
void main() {
ivec2 inputSize = textureSize(inputData, 0);
ivec2 texelCoord = input_stride * ivec2(v_texCoord * vec2(outputSize)) - input_padding;
#if MAX_POOL
vec4 pool = vec4(0);
#else
highp vec4 pool = vec4(0);
#endif
POOL;
outputData = TEXTURE_STORE(pool);
}
#endif // TILED_POOLING
)GLSL";
namespace caffe2 {
template <typename OPBase>
static void computeOutputHW(OPBase* op, int H, int W, int* OH, int* OW) {
Tensor<CPUContext> input, output;
input.Resize(1, 1, H, W);
op->SetOutputSize(input, &output, 1);
CAFFE_ENFORCE_EQ(output.ndim(), 4);
*OH = output.dim(2);
*OW = output.dim(3);
}
template <typename T, GLPool::PoolType poolType>
class GLPoolOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator<float16_t> {
public:
GLPoolOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<CPUContext>(operator_def, ws) {
OPERATOR_NEEDS_FEATURE(order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
CAFFE_ENFORCE(dilation_h() == 1 && dilation_w() == 1,
"Pooling op does not support dilation right now.");
if (!global_pooling_) {
CAFFE_ENFORCE(pad_t() < kernel_h() && pad_b() < kernel_h() && pad_l() < kernel_w() &&
pad_r() < kernel_w(),
"Pad should be smaller than kernel.");
}
}
bool RunOnDeviceWithOrderNCHW() override {
const GLImageVector<T>& input = OperatorBase::Inputs()[0]->template Get<GLImageVector<T>>();
const int num_images = input.size();
const int input_channels = input.channels();
const int input_width = input.width();
const int input_height = input.height();
int output_height;
int output_width;
const int output_channels = input_channels;
computeOutputHW(this, input_height, input_width, &output_height, &output_width);
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
GLImageVector<T>* output = ImageAllocator<T>::newImage(
num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
GLPool::descriptor geometry{input_channels,
{kernel_w(), kernel_h()},
{pad_l(), pad_t()},
{stride_w(), stride_h()},
{input_width, input_height},
{output_height, output_width}};
if (!glPool_) {
LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => " << output_channels << ": "
<< output_height << " X " << output_width << " Kernel: " << kernel_w() << "X" << kernel_h()
<< " Tiling: " << input_tile_x << "X" << input_tile_y;
glPool_.reset(new GLPool(geometry, poolType, input_tile_x > 1 || input_tile_y > 1));
}
glPool_->pool(input, *output);
OperatorBase::Outputs()[0]->Reset(output);
return true;
}
private:
std::unique_ptr<GLPool> glPool_;
};
namespace {
REGISTER_CPU_OPERATOR(OpenGLAveragePool, GLPoolOp<float16_t, GLPool::AveragePool>);
REGISTER_CPU_OPERATOR(OpenGLMaxPool, GLPoolOp<float16_t, GLPool::MaxPool>);
OPERATOR_SCHEMA(OpenGLAveragePool).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(OpenGLMaxPool).NumInputs(1).NumOutputs(1);
}; // namespace
}; // namespace caffe2

View File

@ -1,135 +0,0 @@
#include "../core/GLFilter.h"
#include "../core/GLImage.h"
#include "../core/ImageAllocator.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include <iostream>
#include <vector>
class GLResizeNearest : public GLFilter {
public:
binding* inputData;
binding* outputSize;
binding* scale_reverse;
GLResizeNearest()
: GLFilter("GLResizeNearest",
vertex_shader,
fragment_shader,
std::vector<binding*>({BINDING(outputSize), BINDING(scale_reverse), BINDING(inputData)}),
{/* no uniform blocks*/},
{/* no attributes */},
{/* replacements */}) {}
template <typename T>
void resize(const GLImageVector<T>& input_images,
const GLImageVector<T>& output_images,
float width_scale_rev,
float height_scale_rev);
static const char* fragment_shader;
};
// MARK: GLSL
const char* GLResizeNearest::fragment_shader = R"GLSL(#version 300 es
precision mediump float;
precision mediump int;
in highp vec2 v_texCoord;
uniform ivec2 outputSize;
uniform highp vec2 scale_reverse;
TEXTURE_INPUT(inputData);
TEXTURE_OUTPUT(0, outputData);
void main() {
// it clamps to the edge by default
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize) * scale_reverse);
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
outputData = TEXTURE_STORE(value);
}
)GLSL";
template <typename T>
void GLResizeNearest::resize(const GLImageVector<T>& input_images,
const GLImageVector<T>& output_images,
float width_scale_rev,
float height_scale_rev) {
for (int i = 0; i < input_images.size(); i++) {
auto input_image = input_images[i];
auto output_image = output_images[i];
int input_slices = input_image->slices;
int output_slices = output_image->slices;
for (int is = 0; is < input_slices; is++) {
std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
run(input_attachments,
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
[&]() {
glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
glUniform2f(scale_reverse->location, width_scale_rev, height_scale_rev);
},
output_image->texture_width,
output_image->texture_height);
}
}
}
namespace caffe2 {
template <class T>
class OpenGLResizeNearestOp final : public Operator<CPUContext>, ImageAllocator<T> {
public:
OpenGLResizeNearestOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws), width_scale_(1), height_scale_(1) {
if (HasArgument("width_scale")) {
width_scale_ = static_cast<float>(OperatorBase::GetSingleArgument<float>("width_scale", 1));
}
if (HasArgument("height_scale")) {
height_scale_ = static_cast<float>(OperatorBase::GetSingleArgument<float>("height_scale", 1));
}
}
bool RunOnDevice() override {
const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
const int num_images = input.size();
const int input_width = input.width();
const int input_height = input.height();
const int input_channels = input.channels();
const int output_width = input_width * width_scale_;
const int output_height = input_height * height_scale_;
const int output_channels = input_channels;
const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
GLImageVector<T>* output = ImageAllocator<T>::newImage(
num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
if (!resizeNearest_) {
resizeNearest_.reset(new GLResizeNearest());
}
resizeNearest_->resize(input, *output, 1.0 / width_scale_, 1.0 / height_scale_);
Outputs()[0]->Reset(output);
return true;
}
protected:
float width_scale_;
float height_scale_;
std::unique_ptr<GLResizeNearest> resizeNearest_;
};
REGISTER_CPU_OPERATOR(OpenGLResizeNearest, OpenGLResizeNearestOp<float16_t>);
OPERATOR_SCHEMA(OpenGLResizeNearest).NumInputs(1).NumOutputs(1);
} // namespace caffe2

View File

@ -1,135 +0,0 @@
#include "../core/GLFilter.h"
#include "../core/GLImage.h"
#include "../core/ImageAllocator.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include <iostream>
#include <vector>
typedef enum { Sigmoid, Tanh } OpType;
class GLSigmoid : public GLFilter {
public:
binding* inputData;
binding* outputSize;
GLSigmoid(OpType opType)
: GLFilter(
"GLSigmoid",
vertex_shader,
fragment_shader,
{BINDING(outputSize), BINDING(inputData)},
{/* no uniform blocks */},
{/* no attributes */},
{{"SIGMOID", c10::to_string(opType == Sigmoid)},
{"TANH", c10::to_string(opType == Tanh)}}) {}
template <typename T>
void sigmoid(const GLImageVector<T>& input_images, const GLImageVector<T>& output_images);
static const char* fragment_shader;
};
// MARK: GLSL
const char* GLSigmoid::fragment_shader = R"GLSL(#version 300 es
#define SIGMOID $(SIGMOID)
#define TANH $(TANH)
precision mediump float;
precision mediump int;
in highp vec2 v_texCoord;
uniform ivec2 outputSize;
TEXTURE_INPUT(inputData);
TEXTURE_OUTPUT(0, outputData);
void main() {
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
vec4 value = TEXTURE_LOAD(inputData, ivec2(texelCoord));
#if SIGMOID
value = vec4(1.0) / (vec4(1.0) + exp(-value));
outputData = TEXTURE_STORE(value);
#elif TANH
value = tanh(value);
outputData = TEXTURE_STORE(value);
#endif
}
)GLSL";
template <typename T>
void GLSigmoid::sigmoid(const GLImageVector<T>& input_images,
const GLImageVector<T>& output_images) {
for (int i = 0; i < input_images.size(); i++) {
auto input_image = input_images[i];
auto output_image = output_images[i];
int input_slices = input_image->slices;
int output_slices = output_image->slices;
for (int is = 0; is < input_slices; is++) {
run(std::vector<texture_attachment>({{input_image->textures[is], inputData}}),
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
[&]() { glUniform2i(outputSize->location, output_image->width, output_image->height); },
output_image->width,
output_image->height);
}
}
}
namespace caffe2 {
template <typename T, OpType opType>
class OpenGLSigmoidOp final : public Operator<CPUContext>, ImageAllocator<T> {
public:
OpenGLSigmoidOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws) {}
bool RunOnDevice() override {
const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
const int num_images = input.size();
const int input_channels = input.channels();
const int input_width = input.width();
const int input_height = input.height();
const int output_channels = input_channels;
const int output_width = input_width;
const int output_height = input_height;
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
GLImageVector<T>* output = ImageAllocator<T>::newImage(
num_images, output_width, output_height, output_channels, is_last);
if (!_sigmoid) {
_sigmoid.reset(new GLSigmoid(opType));
}
_sigmoid->sigmoid(input, *output);
Outputs()[0]->Reset(output);
return true;
}
private:
std::unique_ptr<GLSigmoid> _sigmoid;
};
REGISTER_CPU_OPERATOR(OpenGLSigmoid, OpenGLSigmoidOp<float16_t, Sigmoid>);
OPERATOR_SCHEMA(OpenGLSigmoid)
.NumInputs(1)
.NumOutputs(1)
.AllowInplace({{0, 0}})
.IdenticalTypeAndShape();
REGISTER_CPU_OPERATOR(OpenGLTanh, OpenGLSigmoidOp<float16_t, Tanh>);
OPERATOR_SCHEMA(OpenGLTanh)
.NumInputs(1)
.NumOutputs(1)
.AllowInplace({{0, 0}})
.IdenticalTypeAndShape();
} // namespace caffe2

View File

@ -1,434 +0,0 @@
#include "../core/GLFilter.h"
#include "../core/GLImage.h"
#include "caffe2/core/timer.h"
#include <iostream>
#include <vector>
class GLSoftmaxReduce : public GLFilter {
public:
binding* inputTileSize;
binding* outputSize;
binding* outputTileSize;
binding* tileSize;
binding* spatialTileSize;
binding* inputTileRange;
binding* inputData;
binding* maxData;
binding* sumData;
const std::vector<binding*> input_bindings() {
std::vector<binding*> bindings({BINDING(inputTileSize),
BINDING(outputSize),
BINDING(outputTileSize),
BINDING(tileSize),
BINDING(spatialTileSize),
BINDING(inputTileRange),
BINDING(inputData),
BINDING(maxData),
BINDING(sumData)});
return bindings;
}
GLSoftmaxReduce(
bool compute_sum_ = false,
bool tiled = false,
int input_tile_x = 1)
: GLFilter(
"GLSoftmaxReduce",
vertex_shader,
fragment_shader,
input_bindings(),
{/* no uniform_blocks_bindings */},
{/* no attributes */},
{{"COMPUTE_SUM", c10::to_string((int)compute_sum_)},
{"INPUT_TILE_X", c10::to_string(input_tile_x)},
{"TILED_SOFTMAX", c10::to_string(int(tiled))}}) {}
template <typename T>
void reduce(const GLImage<T>* input_image,
const GLImage<T>* output_image,
int tile_size_x,
int tile_size_y);
static const char* fragment_shader;
};
// MARK: GLSL
const char* GLSoftmaxReduce::fragment_shader = R"GLSL(#version 300 es
#define TILED_SOFTMAX $(TILED_SOFTMAX)
#define INPUT_TILE_X $(INPUT_TILE_X)
// Compute sum or max
#define COMPUTE_SUM $(COMPUTE_SUM)
precision highp float;
precision mediump int;
in highp vec2 v_texCoord;
uniform ivec2 inputTileSize;
uniform ivec2 outputSize;
uniform ivec2 outputTileSize;
uniform ivec2 spatialTileSize;
uniform ivec2 tileSize;
uniform ivec2 inputTileRange;
TEXTURE_INPUT(inputData);
TEXTURE_OUTPUT(0, outputData);
#if TILED_SOFTMAX
void main() {
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
ivec2 sumArea = min(spatialTileSize, inputTileSize - tileCoord * spatialTileSize);
vec4 result = vec4(0.0);
for (int tileIdx = inputTileRange.x; tileIdx < inputTileRange.y; tileIdx++) {
int inTileX = tileIdx % INPUT_TILE_X;
int inTileY = tileIdx / INPUT_TILE_X;
ivec2 inputTileOffset = ivec2(inTileX, inTileY) * inputTileSize;
for (int y = 0; y < sumArea.y; y++) {
for (int x = 0; x < sumArea.x; x++) {
ivec2 idx = tileCoord + ivec2(x, y);
vec4 val = TEXTURE_LOAD(inputData, inputTileOffset + idx);
#if COMPUTE_SUM
result += val;
#else
result = max(result, val);
#endif
}
}
}
outputData = TEXTURE_STORE(result);
}
#else
void main() {
ivec2 outputCoord = ivec2(v_texCoord * vec2(outputTileSize));
ivec2 texelCoord = outputCoord * spatialTileSize;
ivec2 sumArea = min(spatialTileSize, inputTileSize - texelCoord);
vec4 result = vec4(0.0);
for (int y = 0; y < sumArea.y; y++) {
for (int x = 0; x < sumArea.x; x++) {
ivec2 idx = texelCoord + ivec2(x, y);
vec4 val = TEXTURE_LOAD(inputData, idx);
#if COMPUTE_SUM
result += val;
#else
result = max(result, val);
#endif
}
}
outputData = TEXTURE_STORE(result);
}
#endif
)GLSL";
template <typename T>
void GLSoftmaxReduce::reduce(const GLImage<T>* input_image,
const GLImage<T>* output_image,
int tile_size_x,
int tile_size_y) {
int input_slices = input_image->slices;
int output_slices = output_image->slices;
for (int is = 0; is < input_slices; is++) {
std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
run(input_attachments,
{output_image->textures.begin() + is,
output_image->textures.begin() + is + 1},
[&]() {
glUniform2i(
inputTileSize->location, input_image->width, input_image->height);
glUniform2i(
outputSize->location,
output_image->texture_width,
output_image->texture_height);
glUniform2i(
outputTileSize->location,
output_image->width,
output_image->height);
glUniform2i(
tileSize->location, input_image->tile_x, input_image->tile_y);
glUniform2i(spatialTileSize->location, tile_size_x, tile_size_y);
glUniform2i(
inputTileRange->location,
0,
std::min(
(input_image->channels + 3) / 4,
input_image->tile_x * input_image->tile_y));
},
output_image->texture_width,
output_image->texture_height);
}
}
class GLSoftmaxScale : public GLFilter {
public:
binding* outputSize;
binding* inputData;
binding* maxData;
binding* sumData;
const std::vector<binding*> input_bindings() {
std::vector<binding*> bindings(
{BINDING(outputSize), BINDING(inputData), BINDING(maxData), BINDING(sumData)});
return bindings;
}
GLSoftmaxScale(bool _compute_exp = false, bool tiled = false)
: GLFilter(
"GLSoftmaxScale",
vertex_shader,
fragment_shader,
input_bindings(),
{/* no uniform blocks */},
{/* no attributes */},
{{"COMPUTE_EXP", c10::to_string((int)_compute_exp)},
{"TILED_SOFTMAX", c10::to_string((int)tiled)}}) {}
template <typename T>
void scale(const GLImage<T>* input_image,
const GLImage<T>* max_image,
const GLImage<T>* sum_image,
const GLImage<T>* output_image);
static const char* fragment_shader;
};
template <typename T>
void GLSoftmaxScale::scale(const GLImage<T>* input_image,
const GLImage<T>* max_image,
const GLImage<T>* sum_image,
const GLImage<T>* output_image) {
int input_slices = input_image->slices;
int output_slices = output_image->slices;
for (int is = 0; is < input_slices; is++) {
std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData},
{max_image->textures[is], maxData},
{sum_image->textures[is], sumData}});
run(input_attachments,
{output_image->textures.begin() + is,
output_image->textures.begin() + is + 1},
[&]() {
glUniform2i(
outputSize->location,
output_image->texture_width,
output_image->texture_height);
},
output_image->texture_width,
output_image->texture_height);
}
}
// MARK: GLSL
const char* GLSoftmaxScale::fragment_shader = R"GLSL(#version 300 es
#define COMPUTE_EXP $(COMPUTE_EXP)
#define TILED_SOFTMAX $(TILED_SOFTMAX)
precision highp float;
precision mediump int;
in highp vec2 v_texCoord;
uniform ivec2 outputSize;
TEXTURE_INPUT(inputData);
TEXTURE_INPUT(maxData);
TEXTURE_INPUT(sumData);
TEXTURE_OUTPUT(0, outputData);
void main() {
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
vec4 val = TEXTURE_LOAD(inputData, texelCoord);
#if COMPUTE_EXP
vec4 maxVal = TEXTURE_LOAD(maxData, ivec2(0));
#if TILED_SOFTMAX
float singleMax = max(max(max(maxVal.x, maxVal.y), maxVal.z), maxVal.w);
maxVal = vec4(singleMax, singleMax, singleMax, singleMax);
outputData = TEXTURE_STORE(exp(val - maxVal));
#else
outputData = TEXTURE_STORE(exp(val - maxVal));
#endif
#else
vec4 sumVal = TEXTURE_LOAD(sumData, ivec2(0));
#if TILED_SOFTMAX
float singleSum = sumVal.x + sumVal.y + sumVal.z + sumVal.w;
sumVal = vec4(singleSum, singleSum, singleSum, singleSum);
outputData = TEXTURE_STORE(val / sumVal);
#else
outputData = TEXTURE_STORE(val / sumVal);
#endif
#endif
}
)GLSL";
#include "../core/ImageAllocator.h"
#include "caffe2/core/operator.h"
#ifndef CAFFE2_MOBILE
#error "Caffe2 mobile state not defined"
#endif
#if CAFFE2_MOBILE
namespace caffe2 {
template <class T>
class OpenGLSoftmax final : public Operator<CPUContext>, ImageAllocator<T> {
public:
OpenGLSoftmax(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws),
order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
}
bool RunOnDevice() override {
const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
const int num_images = input.size();
const int input_channels = input.channels();
const int input_width = input.width();
const int input_height = input.height();
const int output_channels = input_channels;
const int output_width = input_width;
const int output_height = input_height;
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
// For tiling
const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
const bool tiled = input_tile_x > 1 || input_tile_y > 1;
if (tiled) {
CAFFE_ENFORCE_EQ(
input.slices(), 1, "Input needs to be tiled in a single texture");
}
CAFFE_ENFORCE(
tiled || input_channels == 1,
"Softmax only works for input_channel == 1 or input_channel > 1 with tiling enabled.");
// for spatial dimension
const int tile_size_x = 16;
const int tile_size_y = 16;
int max_buf_width = input_width;
int max_buf_height = input_height;
int max_buf_channels = input_channels;
vector<GLImageVector<T>*> reduce_buf;
while (reduce_buf.size() == 0 || (max_buf_height > tile_size_y)) {
max_buf_width = (max_buf_width + tile_size_x - 1) / tile_size_x;
max_buf_height = (max_buf_height + tile_size_y - 1) / tile_size_y;
if (tiled) {
// since we are summing over all the channels within a channel tile
max_buf_channels =
(max_buf_channels + input_tile_x * input_tile_y - 1) /
(input_tile_x + input_tile_y);
}
reduce_buf.push_back(ImageAllocator<T>::newImage(
1,
max_buf_width,
max_buf_height,
max_buf_channels,
output_tile_x,
output_tile_y));
}
GLImageVector<T>* max = ImageAllocator<T>::newImage(num_images, 1, 1, 1);
GLImageVector<T>* sum = ImageAllocator<T>::newImage(num_images, 1, 1, 1);
GLImageVector<T>* after_exp = ImageAllocator<T>::newImage(
num_images,
output_width,
output_height,
output_channels,
output_tile_x,
output_tile_y);
GLImageVector<T>* output_images = ImageAllocator<T>::newImage(
num_images,
output_width,
output_height,
output_channels,
output_tile_x,
output_tile_y,
is_last);
if (!f_max) {
f_max.reset(new GLSoftmaxReduce(false, tiled, input_tile_x));
f_exp.reset(new GLSoftmaxScale(true, tiled));
f_sum.reset(new GLSoftmaxReduce(true, tiled, input_tile_x));
f_scale.reset(new GLSoftmaxScale(false, tiled));
}
for (int i = 0; i < num_images; i++) {
auto input_image = input[i];
auto max_image = (*max)[i];
auto sum_image = (*sum)[i];
auto after_exp_image = (*after_exp)[i];
auto output_image = (*output_images)[i];
// Get Max
for (int ir = 0; ir < reduce_buf.size() + 1; ir++) {
const GLImage<T>* in = ir == 0 ? input_image : (*reduce_buf[ir - 1])[0];
GLImage<T>* out = ir == reduce_buf.size() ? max_image : (*reduce_buf[ir])[0];
const int running_tile_size_x =
ir < reduce_buf.size() ? tile_size_x : in->width;
const int running_tile_size_y =
ir < reduce_buf.size() ? tile_size_y : in->height;
f_max->reduce(in, out, running_tile_size_x, running_tile_size_y);
}
// scale vals by exp(x - max)
f_exp->scale(input_image, max_image, sum_image, after_exp_image);
// Get sum of the exp
for (int ir = 0; ir < reduce_buf.size() + 1; ir++) {
const GLImage<T>* in = ir == 0 ? after_exp_image : (*reduce_buf[ir - 1])[0];
GLImage<T>* out = ir == reduce_buf.size() ? sum_image : (*reduce_buf[ir])[0];
const int running_tile_size_x = ir < reduce_buf.size() ? tile_size_x : in->width;
const int running_tile_size_y = ir < reduce_buf.size() ? tile_size_y : in->height;
f_sum->reduce(in, out, running_tile_size_x, running_tile_size_y);
}
// Scale(softmax)
f_scale->scale(after_exp_image, max_image, sum_image, output_image);
}
Outputs()[OUTPUT]->Reset(output_images);
delete sum;
delete max;
delete after_exp;
for (auto&& rb : reduce_buf) {
delete rb;
}
return true;
}
private:
StorageOrder order_;
std::unique_ptr<GLSoftmaxReduce> f_max;
std::unique_ptr<GLSoftmaxScale> f_exp;
std::unique_ptr<GLSoftmaxReduce> f_sum;
std::unique_ptr<GLSoftmaxScale> f_scale;
INPUT_TAGS(INPUT, FILTER, BIAS);
OUTPUT_TAGS(OUTPUT);
};
REGISTER_CPU_OPERATOR(OpenGLSoftmax, OpenGLSoftmax<float16_t>);
OPERATOR_SCHEMA(OpenGLSoftmax)
.NumInputs(1)
.NumOutputs(1)
.AllowInplace({{0, 0}})
.IdenticalTypeAndShape();
} // namespace caffe2
#endif // CAFFE2_MOBILE

View File

@ -1,397 +0,0 @@
#include "../core/GLContext.h"
#include "../core/GLFilter.h"
#include "../core/GLImage.h"
#include "../core/ImageAllocator.h"
#include "caffe2/core/common.h"
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
enum InputFormat { BGRA = 0, RGBA = 1 };
class GLStylizer : public GLFilter {
binding* inputData;
binding* outputSize;
binding* mean;
binding* noise_std;
bool deprocess;
public:
GLStylizer(bool _deprocess = false, InputFormat input_format = BGRA)
: GLFilter(
_deprocess ? "GLDeStylizer" : "GLStylizer",
vertex_shader,
fragment_shader,
std::vector<binding*>({BINDING(inputData),
BINDING(mean),
BINDING(noise_std),
BINDING(outputSize)}),
{/* no uniform blocks */},
{/* no attributes */},
{{"DEPROCESS", c10::to_string(_deprocess)},
{"RGBAINPUT", c10::to_string(input_format)}}),
deprocess(_deprocess) {}
template <typename T1, typename T2>
void stylize(const GLImage<T1>* input_image,
const GLImage<T2>* output_image,
const float mean_values[3],
float noise_std_value);
static const char* fragment_shader;
};
// MARK: GLSL
const char* GLStylizer::fragment_shader = R"GLSL(#version 300 es
#define DEPROCESS $(DEPROCESS)
#define RGBAINPUT $(RGBAINPUT)
precision mediump float;
precision mediump int;
precision mediump sampler2D;
in highp vec2 v_texCoord;
uniform ivec2 outputSize;
uniform vec3 mean;
uniform float noise_std;
#if DEPROCESS
TEXTURE_INPUT(inputData);
layout(location = 0) out mediump vec4 outputData;
#else
uniform sampler2D inputData;
TEXTURE_OUTPUT(0, outputData);
#endif
#if !DEPROCESS
// http://byteblacksmith.com/improvements-to-the-canonical-one-liner-glsl-rand-for-opengl-es-2-0/
highp float rand(vec2 co) {
highp float a = 12.9898;
highp float b = 78.233;
highp float c = 43758.5453;
highp float dt = dot(co.xy, vec2(a, b));
highp float sn = mod(dt, 3.14);
return fract(sin(sn) * c);
}
#endif
// In AR Engine, input/output a RBGA texture; otherwise, BGRA tensor => texture
#if RGBAINPUT
void main() {
#if DEPROCESS
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
vec4 val = TEXTURE_LOAD(inputData, texelCoord);
outputData = vec4((val.rgb + mean) / 255.0, 1.0).bgra;
#else
outputData = TEXTURE_STORE(vec4(255.0 * texture(inputData, v_texCoord).bgr - mean + vec3(noise_std * rand(v_texCoord)), 0.0));
#endif
}
#else
void main() {
#if DEPROCESS
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
vec4 val = TEXTURE_LOAD(inputData, texelCoord);
outputData = vec4((val.rgb + mean) / 255.0, 1.0);
#else
outputData = TEXTURE_STORE(vec4(255.0 * texture(inputData, v_texCoord).rgb - mean + vec3(noise_std * rand(v_texCoord)), 0.0));
#endif
}
#endif
)GLSL";
template <typename T1, typename T2>
void GLStylizer::stylize(const GLImage<T1>* input_image,
const GLImage<T2>* output_image,
const float mean_values[3],
float noise_std_value) {
int input_slices = input_image->slices;
int output_slices = output_image->slices;
run(std::vector<texture_attachment>({{input_image->textures[0], inputData}}),
{output_image->textures[0]},
[&]() {
glUniform2i(outputSize->location, output_image->width, output_image->height);
glUniform3f(mean->location, mean_values[0], mean_values[1], mean_values[2]);
if (!deprocess) {
glUniform1f(noise_std->location, noise_std_value);
}
},
output_image->width,
output_image->height);
}
namespace caffe2 {
class OpenGLTensorToTextureStylizerPreprocessOp : public Operator<CPUContext>,
ImageAllocator<uint8_t>,
ImageAllocator<float16_t> {
public:
// Expect this many channels as input
static constexpr int kInputChannels = 4;
// Expect this many channels as output
static constexpr int kOutputChannels = 3;
USE_OPERATOR_BASE_FUNCTIONS;
OpenGLTensorToTextureStylizerPreprocessOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws) {}
bool RunOnDevice() {
const auto& input = Input(0);
const auto& mean = Input(1);
CAFFE_ENFORCE(input.ndim() == 4);
const int num_images = input.dim32(0);
const int input_height = input.dim32(1);
const int input_width = input.dim32(2);
const int input_channels = input.dim32(3);
CAFFE_ENFORCE(input.dim32(0) == 1); // N == 1
CAFFE_ENFORCE(input_channels == kInputChannels);
CAFFE_ENFORCE(mean.size() == kOutputChannels); // Assume BGR or BGRA
// get the buffers from input tensors
const float* mean_buffer = mean.template data<float>();
const uint8_t* input_buffer = input.template data<uint8_t>();
// set up the OpenGL context
GLContext::getGLContext()->set_context();
GLImageVector<float16_t>* output_images = ImageAllocator<float16_t>::newImage(num_images,
input_width,
input_height,
kOutputChannels,
#if CAFFE2_IOS
true
#else
false
#endif
);
const int tile_x = 1, tile_y = 1;
GLImageVector<uint8_t>* input_images = ImageAllocator<uint8_t>::newImage(
num_images, input_width, input_height, kInputChannels, tile_x, tile_y, false);
for (int i = 0; i < num_images; i++) {
auto input_image = (*input_images)[i];
auto output_image = (*output_images)[i];
const GLTexture* inputTexture = input_image->textures[0];
inputTexture->loadData(input_buffer);
if (!glStylizer_) {
glStylizer_.reset(new GLStylizer());
}
glStylizer_->stylize(
input_image, output_image, mean_buffer, GetSingleArgument<float>("noise_std", 10.0));
}
delete input_images;
Outputs()[0]->Reset(output_images);
return true;
}
private:
std::unique_ptr<GLStylizer> glStylizer_;
};
template <InputFormat inputFormat>
class OpenGLTextureToTextureStylizerPreprocessOp : public Operator<CPUContext>,
ImageAllocator<uint8_t>,
ImageAllocator<float16_t> {
public:
// Expect this many channels as input
static constexpr int kInputChannels = 4;
// Expect this many channels as output
static constexpr int kOutputChannels = 3;
USE_OPERATOR_BASE_FUNCTIONS;
OpenGLTextureToTextureStylizerPreprocessOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws) {}
bool RunOnDevice() {
const GLImageVector<uint8_t>& input = Inputs()[0]->template Get<GLImageVector<uint8_t>>();
const auto& mean = Input(1);
const int num_images = input.size();
const int input_height = input.height();
const int input_width = input.width();
const int input_channels = input.channels();
CAFFE_ENFORCE_GT(num_images, 0);
CAFFE_ENFORCE(input[0]->slices == 1); // N == 1
CAFFE_ENFORCE(input_channels == kInputChannels);
CAFFE_ENFORCE(mean.size() == kOutputChannels); // Assume BGR or BGRA
// get the buffers from input tensors
const float* mean_buffer = mean.template data<float>();
GLImageVector<float16_t>* output_images = ImageAllocator<float16_t>::newImage(
num_images, input_width, input_height, kOutputChannels, false);
if (!glStylizer_) {
glStylizer_.reset(new GLStylizer(false, inputFormat));
}
for (int i = 0; i < num_images; i++) {
auto input_image = input[i];
auto output_image = (*output_images)[i];
glStylizer_->stylize(
input_image, output_image, mean_buffer, GetSingleArgument<float>("noise_std", 10.0));
}
Outputs()[0]->Reset(output_images);
return true;
}
private:
std::unique_ptr<GLStylizer> glStylizer_;
};
REGISTER_CPU_OPERATOR(OpenGLTensorToTextureStylizerPreprocess,
OpenGLTensorToTextureStylizerPreprocessOp);
OPERATOR_SCHEMA(OpenGLTensorToTextureStylizerPreprocess).NumInputs(2).NumOutputs(1);
REGISTER_CPU_OPERATOR(OpenGLTextureToTextureStylizerPreprocess,
OpenGLTextureToTextureStylizerPreprocessOp<RGBA>);
OPERATOR_SCHEMA(OpenGLTextureToTextureStylizerPreprocess).NumInputs(2).NumOutputs(1);
class OpenGLTextureToTensorStylizerDeprocessOp : public Operator<CPUContext>,
ImageAllocator<uint8_t> {
public:
using Operator<CPUContext>::Operator;
// Expect this many channels as input
static constexpr int kInputChannels = 3;
// Expect this many channels as output
static constexpr int kOutputChannels = 4;
bool RunOnDevice() {
const GLImageVector<float16_t>& input = Inputs()[0]->template Get<GLImageVector<float16_t>>();
const auto& mean = Input(1);
auto* output = Output(0);
const int num_images = input.size(), channels = input.channels(), height = input.height(),
width = input.width();
// Assume BGR or BGRA
CAFFE_ENFORCE(mean.size() == kInputChannels);
CAFFE_ENFORCE(channels == kInputChannels);
// RGB
output->Resize(num_images, height, width, kOutputChannels);
const auto* mean_data = mean.template data<float>();
auto* output_buffer = output->template mutable_data<uint8_t>();
GLImageVector<uint8_t>* output_images =
ImageAllocator<uint8_t>::newImage(num_images, width, height, kOutputChannels, true);
if (!glStylizer_) {
glStylizer_.reset(new GLStylizer(true));
}
for (int i = 0; i < num_images; i++) {
auto input_image = input[i];
auto output_image = (*output_images)[i];
glStylizer_->stylize(input_image, output_image, mean_data, 0);
output_image->textures[0]->map_read([&](const void* buffer,
size_t width,
size_t height,
size_t stride,
size_t channels,
const GLTexture::Type& type) {
if (width == stride) {
memcpy(output_buffer, buffer, channels * width * height);
} else {
typedef uint8_t(input_data_t)[height][stride][channels];
typedef uint8_t(output_data_t)[height][width][channels];
const input_data_t& input_data = *reinterpret_cast<const input_data_t*>(buffer);
output_data_t& output_data = *reinterpret_cast<output_data_t*>(output_buffer);
for (int y = 0; y < height; y++) {
memcpy(output_data[y], input_data[y], channels * width);
}
}
});
}
delete output_images;
return true;
}
private:
std::unique_ptr<GLStylizer> glStylizer_;
};
template <InputFormat inputFormat>
class OpenGLTextureToTextureStylizerDeprocessOp : public Operator<CPUContext>,
ImageAllocator<uint8_t> {
public:
using Operator<CPUContext>::Operator;
// Expect this many channels as input
static constexpr int kInputChannels = 3;
// Expect this many channels as output
static constexpr int kOutputChannels = 4;
bool RunOnDevice() {
const GLImageVector<float16_t>& input = Inputs()[0]->template Get<GLImageVector<float16_t>>();
const auto& mean = Input(1);
const int num_images = input.size(), channels = input.channels(), height = input.height(),
width = input.width();
CAFFE_ENFORCE(mean.size() == kInputChannels);
CAFFE_ENFORCE(channels == kInputChannels);
const auto* mean_data = mean.template data<float>();
// Use foreignTextureAllocator inside GLContext
// glDeleteTexture will not be called from inside caffe2 for this texture
GLImageVector<uint8_t>* output_images;
auto textureAllocator = GLContext::getGLContext()->getTextureAllocator();
const int tile_x = 1, tile_y = 1;
if (textureAllocator != nullptr) {
output_images = ImageAllocator<uint8_t>::newImage(
num_images, width, height, kOutputChannels, tile_x, tile_y, textureAllocator);
} else {
// fallback when textureAllocator is not set
output_images = ImageAllocator<uint8_t>::newImage(num_images, width, height, kOutputChannels);
}
if (!glStylizer_) {
glStylizer_.reset(new GLStylizer(true, inputFormat));
}
for (int i = 0; i < num_images; i++) {
auto input_image = input[i];
auto output_image = (*output_images)[i];
glStylizer_->stylize(input_image, output_image, mean_data, 0);
}
Outputs()[0]->Reset(output_images);
return true;
}
private:
std::unique_ptr<GLStylizer> glStylizer_;
};
REGISTER_CPU_OPERATOR(OpenGLTextureToTensorStylizerDeprocess,
OpenGLTextureToTensorStylizerDeprocessOp);
OPERATOR_SCHEMA(OpenGLTextureToTensorStylizerDeprocess).NumInputs(2).NumOutputs(1);
REGISTER_CPU_OPERATOR(OpenGLTextureToTextureStylizerDeprocess,
OpenGLTextureToTextureStylizerDeprocessOp<RGBA>);
OPERATOR_SCHEMA(OpenGLTextureToTextureStylizerDeprocess).NumInputs(2).NumOutputs(1);
} // namespace caffe2

View File

@ -1,133 +0,0 @@
#include "../core/GLFilter.h"
#include "../core/GLImage.h"
#include "../core/ImageAllocator.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include <iostream>
#include <vector>
class GLSub : public GLFilter {
public:
binding* inputData[2];
binding* outputSize;
GLSub()
: GLFilter("GLSub",
vertex_shader,
fragment_shader,
std::vector<binding*>({BINDING(outputSize), BINDING(inputData[0]), BINDING(inputData[1])}),
{/* no uniform blocks */},
{/* no attributes */},
{/* no replacements */}) {}
template <typename T>
void sub(const GLImageVector<T>& input_image0,
const GLImageVector<T>& input_image1,
const GLImageVector<T>& output_image);
static const char* fragment_shader;
};
// MARK: GLSL
const char* GLSub::fragment_shader = R"GLSL(#version 300 es
precision mediump float;
precision mediump int;
in highp vec2 v_texCoord;
uniform ivec2 outputSize;
TEXTURE_INPUT(inputData[2]);
TEXTURE_OUTPUT(0, outputData);
void main() {
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
vec4 A = TEXTURE_LOAD(inputData[0], texelCoord);
vec4 B = TEXTURE_LOAD(inputData[1], texelCoord);
vec4 value = A - B;
outputData = TEXTURE_STORE(value);}
)GLSL";
template <typename T>
void GLSub::sub(const GLImageVector<T>& input_images0,
const GLImageVector<T>& input_images1,
const GLImageVector<T>& output_images) {
const int num_images = input_images0.size();
for (int i = 0; i < num_images; i++) {
GLImage<T>* input_image0 = input_images0[i];
GLImage<T>* input_image1 = input_images1[i];
int input_slices = input_image0->slices;
GLImage<T>* output_image = output_images[i];
int output_slices = output_image->slices;
for (int is = 0; is < input_slices; is++) {
std::vector<texture_attachment> input_attachments;
input_attachments.push_back({input_image0->textures[is], inputData[0]});
input_attachments.push_back({input_image1->textures[is], inputData[1]});
run(input_attachments,
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
[&]() { glUniform2i(outputSize->location, output_image->width, output_image->height); },
output_image->width,
output_image->height);
}
}
}
namespace caffe2 {
template <typename T>
class OpenGLSubOp final : public Operator<CPUContext>, ImageAllocator<T> {
public:
OpenGLSubOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws) {
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("broadcast") == false, "OpenGLSub does not support broadcast");
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false, "OpenGLSub does not support axis");
}
bool RunOnDevice() override {
const GLImageVector<T>& input0 = Inputs()[0]->template Get<GLImageVector<T>>();
const GLImageVector<T>& input1 = Inputs()[1]->template Get<GLImageVector<T>>();
CAFFE_ENFORCE_EQ(input0.size(), input1.size());
const int num_images = input0.size();
const int input_channels = input0.channels();
const int input_width = input0.width();
const int input_height = input0.height();
CAFFE_ENFORCE_EQ(input1.channels(), input_channels);
CAFFE_ENFORCE_EQ(input1.width(), input_width);
CAFFE_ENFORCE_EQ(input1.height(), input_height);
const int output_channels = input_channels;
const int output_width = input_width;
const int output_height = input_height;
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
GLImageVector<T>* output = ImageAllocator<T>::newImage(
num_images, output_width, output_height, output_channels, is_last);
if (!_sub) {
_sub.reset(new GLSub());
}
_sub->sub(input0, input1, *output);
Outputs()[0]->Reset(output);
return true;
}
private:
std::unique_ptr<GLSub> _sub;
};
REGISTER_CPU_OPERATOR(OpenGLSub, OpenGLSubOp<float16_t>);
OPERATOR_SCHEMA(OpenGLSub).NumInputs(2).NumOutputs(1);
} // namespace caffe2

View File

@ -1,33 +0,0 @@
#pragma once
#include <cmath>
struct point {
int x;
int y;
};
struct tile_descriptor {
point tile_dims;
point tile_size;
int tiles;
};
namespace caffe2 {
inline static void squareFactors(int N, int& r1, int& r2) {
int f = sqrt(N);
if (f * f == N) {
r1 = r2 = f;
} else {
while (N % f != 0) {
f--;
}
r1 = N / f;
r2 = f;
}
}
inline static void computeOutputTiles(int output_channels, int& output_tile_x, int& output_tile_y) {
squareFactors((output_channels + 3) / 4, output_tile_x, output_tile_y);
}
} // namespace caffe2

View File

@ -1,381 +0,0 @@
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/math.h"
#include "../core/GL.h"
#include "../core/GLLogging.h"
#include "../core/arm_neon_support.h"
#include "../operators/gl_tiling_utils.h"
#include "TestGLConvolution.h"
#include <vector>
void AddNoiseInput(const std::vector<int64_t>& shape,
const std::string& name,
caffe2::Workspace* ws) {
caffe2::CPUContext context;
caffe2::Blob* blob = ws->CreateBlob(name);
auto* tensor = blob->GetMutable<caffe2::TensorCPU>();
tensor->Resize(shape);
caffe2::math::RandGaussian<float, caffe2::CPUContext>(
tensor->size(), 0.0f, 10.0f, tensor->mutable_data<float>(), &context);
}
double BenchOp(const std::string& typ,
int inputC,
int outputC,
int kW,
int kH,
int stride,
int inW,
int inH,
bool transposed,
caffe2::Workspace* ws = nullptr) {
caffe2::Workspace localWs;
if (!ws) {
ws = &localWs;
}
const char* engine = transposed ? "MOBILE" : "NNPACK";
caffe2::OperatorDef def1;
def1.set_name("test");
def1.set_type(typ);
def1.set_engine(engine);
def1.add_input("X");
def1.add_input("W");
def1.add_input("B");
def1.add_output("Y");
def1.add_arg()->CopyFrom(caffe2::MakeArgument("kernel_h", kH));
def1.add_arg()->CopyFrom(caffe2::MakeArgument("kernel_w", kW));
def1.add_arg()->CopyFrom(caffe2::MakeArgument("stride_h", stride));
def1.add_arg()->CopyFrom(caffe2::MakeArgument("stride_w", stride));
def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_t", 0));
def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_l", 0));
def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_b", 0));
def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_r", 0));
def1.add_arg()->CopyFrom(caffe2::MakeArgument("convolution_transform_strategy", std::string("PRECOMPUTE")));
AddNoiseInput(std::vector<int64_t>{1, inputC, inH, inW}, "X", ws);
if (transposed) {
AddNoiseInput(std::vector<int64_t>{inputC, outputC, kH, kW}, "W", ws);
} else {
AddNoiseInput(std::vector<int64_t>{outputC, inputC, kH, kW}, "W", ws);
}
AddNoiseInput(std::vector<int64_t>{outputC}, "B", ws);
std::unique_ptr<caffe2::OperatorBase> op1(CreateOperator(def1, ws));
// Measure one iteration
caffe2::Timer timer;
timer.Start();
op1->Run();
float one_iteration = timer.MilliSeconds();
int target_iterations = std::max((int)(1000 / one_iteration), 1);
int warmup_iterations = std::max((int)(200 / one_iteration), 1);
// warm up
for (int i = 0; i < warmup_iterations; i++) {
op1->Run();
}
timer.Start();
int runs = target_iterations;
for (int i = 0; i < runs; i++) {
op1->Run();
}
auto total_t = timer.MilliSeconds();
gl_log(GL_LOG,
"%s(%d -> %d, %dx%d - %dx%d - %s) took: %.4f ms/iter\n",
typ.c_str(),
inputC,
outputC,
inW,
inH,
kW,
kH,
engine,
timer.MilliSeconds() / (float)runs);
return double(total_t) / runs;
}
template <typename T>
static double BenchGLConvolution(int input_channels,
int output_channels,
int kernel_width,
int kernel_height,
int input_width,
int input_height,
int input_padding,
int input_stride,
bool transposed,
caffe2::Workspace* ws = nullptr) {
int tile_x = 1, tile_y = 1;
caffe2::squareFactors((input_channels + 3) / 4, tile_x, tile_y);
gl_log(GL_LOG, "Input Tiles Factors: %d, %d\n", tile_x, tile_y);
caffe2::Workspace localWs;
if (!ws) {
ws = &localWs;
}
AddNoiseInput(
std::vector<int64_t>{1, input_channels, input_height, input_width}, "X_cpu", ws);
if (transposed) {
AddNoiseInput(
std::vector<int64_t>{input_channels, output_channels, kernel_height, kernel_width},
"W",
ws);
} else {
AddNoiseInput(
std::vector<int64_t>{output_channels, input_channels, kernel_height, kernel_width},
"W",
ws);
}
AddNoiseInput(std::vector<int64_t>{output_channels}, "b", ws);
caffe2::NetDef netdef;
{
auto& op = *(netdef.add_op());
op.set_type("CopyToOpenGL");
op.add_input("X_cpu");
op.add_output("X_gl");
{
auto& arg = *(op.add_arg());
arg.set_name("tile_x");
arg.set_i(tile_x);
}
{
auto& arg = *(op.add_arg());
arg.set_name("tile_y");
arg.set_i(tile_y);
}
}
{
auto& op = *(netdef.add_op());
op.set_type(transposed ? "OpenGLConvTranspose" : "OpenGLConv");
op.add_input("X_gl");
{
op.add_input("W");
op.add_input("b");
}
{
auto& arg = *(op.add_arg());
arg.set_name("order");
arg.set_s("NCHW");
}
{
auto& arg = *(op.add_arg());
arg.set_name("kernel");
arg.set_i(kernel_height);
}
{
auto& arg = *(op.add_arg());
arg.set_name("pad");
arg.set_i(input_padding);
}
{
auto& arg = *(op.add_arg());
arg.set_name("stride");
arg.set_i(input_stride);
}
{
auto& arg = *(op.add_arg());
arg.set_name("is_last");
arg.set_i(1);
}
op.add_output("Y_gl");
}
std::vector<std::unique_ptr<caffe2::OperatorBase>> ops;
for (auto& op : netdef.op()) {
ops.push_back(CreateOperator(op, ws));
}
// Run the Copy Operator
ops[0]->Run();
// Make sure the tested operator is precompiled
ops[1]->Run();
glFinish();
// Measure one iteration
caffe2::Timer timer;
timer.Start();
ops[1]->Run();
glFinish();
float one_iteration = timer.MilliSeconds();
int target_iterations = std::max((int)(1000 / one_iteration), 1);
int warmup_iterations = std::max((int)(200 / one_iteration), 1);
// warm up
for (int i = 0; i < warmup_iterations; i++) {
ops[1]->Run();
}
glFinish();
timer.Start();
int runs = target_iterations;
for (int i = 0; i < runs; i++) {
ops[1]->Run();
}
glFinish();
const double gpuIterTime = double(timer.MilliSeconds()) / runs;
gl_log(GL_LOG,
"%s(%d -> %d, %dx%d - %dx%d - OpenGL) took: %.4f ms/iter\n",
transposed ? "ConvTranspose" : "Conv",
input_channels,
output_channels,
input_width,
input_height,
kernel_width,
kernel_height,
gpuIterTime);
return gpuIterTime;
}
void TestGLConvolution() {
caffe2::Workspace ws;
ws.GetThreadPool()->setMinWorkSize(0);
// small input sizes
// std::vector<int> sizes({14, 26, 52, 104});
// std::vector<int> channels({128, 64}); // not working for 512 and 256 channels yet
// std::vector<int> channels({512, 256, 128, 64});
// large input sizes
// std::vector<int> sizes({208, 312, 416, 720, 1080});
// std::vector<int> channels({16, 4});
//
std::vector<int> sizes({14, 26, 52, 104, 208});
// std::vector<int> channels({24, 16, 4});
// std::vector<int> sizes({14});
std::vector<int> channels({32, 64, 128, 192, 256, 384, 512});
std::vector<int> kernels({3});
bool transposed = false;
int stride = 1;
for (const auto& space : sizes) {
for (const auto& input_channel : channels) {
int output_channel = input_channel;
/* for (const auto& output_channel : channels) */ {
for (const auto& kernel : kernels) {
const double gpuIterTime = BenchGLConvolution<float16_t>(
input_channel, output_channel, kernel, kernel, space, space, 0, stride, transposed, &ws);
const double cpuIterTime = BenchOp(transposed ? "ConvTranspose" : "Conv",
input_channel,
output_channel,
kernel,
kernel,
stride,
space,
space,
transposed,
&ws);
const double flops = double(input_channel) * output_channel * kernel * kernel *
(kernel == 1 ? space : space - 2) * (kernel == 1 ? space : space - 2) * 2;
// gl_log(GL_LOG,
printf(
"Conv: X: %ix%i \tC: %i -> %i\tK: %ix%i\t16b GPU GFLOPS: %.2f\t32b CPU GFLOPS:"
"%.2f\tratio: "
"%.2f\n",
space,
space,
input_channel,
output_channel,
kernel,
kernel,
flops / gpuIterTime / 1E6,
flops / cpuIterTime / 1E6,
cpuIterTime / gpuIterTime);
}
}
}
}
// // ConvTranspose
// BenchGLConvolution<float16_t>(16, 16, 3, 3, 640, 360, 0, 2, true);
// BenchGLConvolution<float16_t>(16, 16, 4, 4, 640, 360, 0, 2, true);
// BenchGLConvolution<float16_t>(16, 16, 5, 5, 640, 360, 0, 2, true);
// BenchGLConvolution<float16_t>(16, 16, 6, 6, 640, 360, 0, 2, true);
// BenchGLConvolution<float16_t>(16, 16, 7, 7, 640, 360, 0, 2, true);
// BenchGLConvolution<float16_t>(16, 16, 8, 8, 640, 360, 0, 2, true);
// BenchGLConvolution<float16_t>(16, 16, 9, 9, 640, 360, 0, 2, true);
//
// BenchOp("ConvTranspose", 16, 16, 3, 3, 2, 640, 360, true);
// BenchOp("ConvTranspose", 16, 16, 4, 4, 2, 640, 360, true);
// BenchOp("ConvTranspose", 16, 16, 5, 5, 2, 640, 360, true);
// BenchOp("ConvTranspose", 16, 16, 6, 6, 2, 640, 360, true);
// BenchOp("ConvTranspose", 16, 16, 7, 7, 2, 640, 360, true);
// BenchOp("ConvTranspose", 16, 16, 8, 8, 2, 640, 360, true);
// BenchOp("ConvTranspose", 16, 16, 9, 9, 2, 640, 360, true);
//
// // Conv
// BenchGLConvolution<float16_t>(16, 16, 3, 3, 1280, 720, 0, 1, false);
// BenchGLConvolution<float16_t>(16, 16, 4, 4, 1280, 720, 0, 1, false);
// BenchGLConvolution<float16_t>(16, 16, 5, 5, 1280, 720, 0, 1, false);
// BenchGLConvolution<float16_t>(16, 16, 6, 6, 1280, 720, 0, 1, false);
// BenchGLConvolution<float16_t>(16, 16, 7, 7, 1280, 720, 0, 1, false);
// BenchGLConvolution<float16_t>(16, 16, 8, 8, 1280, 720, 0, 1, false);
// BenchGLConvolution<float16_t>(16, 16, 9, 9, 1280, 720, 0, 1, false);
//
// BenchOp("Conv", 16, 16, 3, 3, 1, 1280, 720, false);
// BenchOp("Conv", 16, 16, 4, 4, 1, 1280, 720, false);
// BenchOp("Conv", 16, 16, 5, 5, 1, 1280, 720, false);
// BenchOp("Conv", 16, 16, 6, 6, 1, 1280, 720, false);
// BenchOp("Conv", 16, 16, 7, 7, 1, 1280, 720, false);
// BenchOp("Conv", 16, 16, 8, 8, 1, 1280, 720, false);
// BenchOp("Conv", 16, 16, 9, 9, 1, 1280, 720, false);
// BenchGLConvolution<float16_t>(16, 16, 3, 3, 80, 45, 0, 1, false);
// BenchGLConvolution<float16_t>(16, 16, 3, 3, 160, 90, 0, 1, false);
// BenchGLConvolution<float16_t>(16, 16, 3, 3, 320, 180, 0, 1, false);
// BenchGLConvolution<float16_t>(16, 16, 3, 3, 640, 360, 0, 1, false);
// BenchGLConvolution<float16_t>(16, 16, 3, 3, 1280, 720, 0, 1, false);
//
// BenchOp("Conv", 16, 16, 3, 3, 1, 80, 45, false);
// BenchOp("Conv", 16, 16, 3, 3, 1, 160, 90, false);
// BenchOp("Conv", 16, 16, 3, 3, 1, 320, 180, false);
// BenchOp("Conv", 16, 16, 3, 3, 1, 640, 360, false);
// BenchOp("Conv", 16, 16, 3, 3, 1, 1280, 720, false);
//
// BenchGLConvolution<float16_t>(128, 128, 3, 3, 14, 14, 0, 1, false);
// BenchGLConvolution<float16_t>(256, 256, 3, 3, 14, 14, 0, 1, false);
// BenchGLConvolution<float16_t>(128, 128, 3, 3, 28, 28, 0, 1, false);
// BenchGLConvolution<float16_t>(256, 256, 3, 3, 28, 28, 0, 1, false);
// BenchGLConvolution<float16_t>(128, 128, 3, 3, 56, 56, 0, 1, false);
// BenchGLConvolution<float16_t>(256, 256, 3, 3, 56, 56, 0, 1, false);
// BenchGLConvolution<float16_t>(64, 64, 7, 7, 128, 128, 0, 1, false);
//
// BenchOp("Conv", 128, 128, 3, 3, 1, 14, 14, false);
// BenchOp("Conv", 256, 256, 3, 3, 1, 14, 14, false);
// BenchOp("Conv", 128, 128, 3, 3, 1, 28, 28, false);
// BenchOp("Conv", 256, 256, 3, 3, 1, 28, 28, false);
// BenchOp("Conv", 128, 128, 3, 3, 1, 56, 56, false);
// BenchOp("Conv", 256, 256, 3, 3, 1, 56, 56, false);
// BenchOp("Conv", 64, 64, 7, 7, 1, 128, 128, false);
}

View File

@ -1,4 +0,0 @@
#pragma once
void TestGLConvolution();

File diff suppressed because it is too large Load Diff

View File

@ -1,38 +0,0 @@
#include "caffe2/proto/caffe2_pb.h"
namespace caffe2 {
void testOpenGL();
void compareModelsForOpenGL(std::string name,
const NetDef& initNet,
NetDef predictNet,
int width,
int height,
int channel,
std::string input_type,
std::string input_order);
void compareBatchedToTiledModels(std::string name,
const NetDef& initNet,
NetDef predictNet,
int width,
int height,
int channel,
std::string input_type,
std::string input_order);
int runModelBenchmarks(caffe2::NetDef& init_net,
caffe2::NetDef& predict_net,
int warm_up_runs,
int main_runs,
int channel,
int height,
int width,
std::string input_type,
std::string input_order,
std::string engine,
bool run_individual = false,
bool use_texture_input = false,
bool use_tiling = false,
bool run_fusion = true);
} // namespace caffe2

View File

@ -834,23 +834,13 @@ if(USE_PROF)
endif()
endif()
if (USE_MOBILE_OPENGL)
if (ANDROID)
list(APPEND Caffe2_DEPENDENCY_LIBS EGL GLESv2)
elseif (IOS)
message(STATUS "TODO item for adding ios opengl dependency")
else()
message(WARNING "mobile opengl is only used in android or ios builds.")
caffe2_update_option(USE_MOBILE_OPENGL OFF)
endif()
endif()
# ---[ ARM Compute Library: check compatibility.
if (USE_ACL)
if (NOT ANDROID)
message(WARNING "ARM Compute Library is only supported for Android builds.")
caffe2_update_option(USE_ACL OFF)
else()
list(APPEND Caffe2_DEPENDENCY_LIBS EGL GLESv2)
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^armv")
# 32-bit ARM (armv7, armv7-a, armv7l, etc)
set(ACL_ARCH "armv7a")

View File

@ -98,7 +98,6 @@ function (caffe2_print_configuration_summary)
message(STATUS " USE_METAL : ${USE_METAL}")
message(STATUS " USE_MKL : ${CAFFE2_USE_MKL}")
message(STATUS " USE_MKLDNN : ${CAFFE2_USE_MKLDNN}")
message(STATUS " USE_MOBILE_OPENGL : ${USE_MOBILE_OPENGL}")
message(STATUS " USE_NCCL : ${USE_NCCL}")
if(${USE_NCCL})
message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}")

View File

@ -85,9 +85,6 @@ CMAKE_ARGS+=("-DANDROID_NDK=$ANDROID_NDK")
CMAKE_ARGS+=("-DANDROID_ABI=armeabi-v7a with NEON")
CMAKE_ARGS+=("-DANDROID_NATIVE_API_LEVEL=21")
CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=rtti exceptions")
# TODO: As the toolchain file doesn't support NEON-FP16 extension,
# we disable USE_MOBILE_OPENGL for now, it will be re-enabled in the future.
CMAKE_ARGS+=("-DUSE_MOBILE_OPENGL=OFF")
# Use-specified CMake arguments go last to allow overridding defaults
CMAKE_ARGS+=($@)