mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
caffe2 mobile opengl (#15322)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15322 caffe2 mobile opengl code is not used, deleting it to reduce complications when we perform other changes Reviewed By: Maratyszcza Differential Revision: D13499943 fbshipit-source-id: 6479f6b9f50f08b5ae28f8f0bc4a1c4fc3f3c3c2
This commit is contained in:
committed by
Facebook Github Bot
parent
54d8ce94ee
commit
12cf5178aa
@ -92,7 +92,6 @@ option(USE_LEVELDB "Use LEVELDB" ON)
|
||||
option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
|
||||
option(USE_LMDB "Use LMDB" ON)
|
||||
option(USE_METAL "Use Metal for iOS build" ON)
|
||||
option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON)
|
||||
option(USE_NATIVE_ARCH "Use -march=native" OFF)
|
||||
option(USE_NCCL "Use NCCL" ON)
|
||||
option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF)
|
||||
|
@ -1,8 +1,4 @@
|
||||
add_subdirectory(ios)
|
||||
# [FIX later or remove] opengl code will be broken because of tensor refactoring, remove this from CI to unblock
|
||||
if(USE_MOBILE_OPENGL AND (ANDROID OR IOS))
|
||||
# add_subdirectory(opengl)
|
||||
endif()
|
||||
if (USE_ACL)
|
||||
# add_subdirectory(arm-compute)
|
||||
endif()
|
||||
|
@ -1,11 +0,0 @@
|
||||
add_subdirectory(core)
|
||||
add_subdirectory(operators)
|
||||
|
||||
if (ANDROID)
|
||||
add_subdirectory(android)
|
||||
endif()
|
||||
|
||||
if (IOS)
|
||||
add_subdirectory(ios)
|
||||
endif()
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
|
@ -1,156 +0,0 @@
|
||||
#include "AndroidGLContext.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "gl3stub.h"
|
||||
#include <regex>
|
||||
|
||||
namespace {
|
||||
|
||||
static const std::unordered_map<std::string, GL_Renderer>& renderer_map() {
|
||||
static std::unordered_map<std::string, GL_Renderer> m = {
|
||||
{"Adreno", Adreno},
|
||||
{"Mali", Mali},
|
||||
{"NVIDIA", Tegra} /*, {"PowerVR", PowerVR} */};
|
||||
return m;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
EGLContext AndroidGLContext::create_opengl_thread_context() {
|
||||
EGLSurface surface = EGL_NO_SURFACE;
|
||||
EGLContext context = EGL_NO_CONTEXT;
|
||||
EGLDisplay display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
|
||||
if (display == EGL_NO_DISPLAY) {
|
||||
// We failed to get a display
|
||||
CAFFE_THROW("Problem with OpenGL context");
|
||||
return context;
|
||||
}
|
||||
|
||||
EGLint major;
|
||||
EGLint minor;
|
||||
eglInitialize(display, &major, &minor);
|
||||
|
||||
const EGLint configAttr[] = {EGL_RENDERABLE_TYPE,
|
||||
EGL_OPENGL_ES2_BIT,
|
||||
EGL_SURFACE_TYPE,
|
||||
EGL_PBUFFER_BIT, // we create a pixelbuffer surface
|
||||
EGL_NONE};
|
||||
|
||||
EGLint numConfig;
|
||||
EGLConfig eglConfig;
|
||||
if (!eglChooseConfig(display, configAttr, &eglConfig, 1, &numConfig)) {
|
||||
// We failed to find a suitable config
|
||||
eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
|
||||
eglTerminate(display);
|
||||
display = EGL_NO_DISPLAY;
|
||||
CAFFE_THROW("Problem with OpenGL context");
|
||||
return context;
|
||||
}
|
||||
|
||||
const EGLint ctxAttr[] = {EGL_CONTEXT_CLIENT_VERSION,
|
||||
2, // very important!
|
||||
EGL_NONE};
|
||||
|
||||
// Create an EGL context based on the chosen configuration.
|
||||
context = eglCreateContext(display, eglConfig, EGL_NO_CONTEXT, ctxAttr);
|
||||
|
||||
// We need a surface. For most mixed JNI/Java based apps it is suggested
|
||||
// that we pass a Java surface through JNI and extract the surface
|
||||
// Pure NDK apps get passed the android_app structure which includes a surface
|
||||
// We want our own OpenGL context for the current thread.
|
||||
// Here we create a fake 1x1 'pixel buffer' surface.
|
||||
// We don't expecting to run vertex or fragment shaders.
|
||||
|
||||
const EGLint surfaceAttr[] = {EGL_WIDTH, 1, EGL_HEIGHT, 1, EGL_NONE};
|
||||
|
||||
surface = eglCreatePbufferSurface(display, eglConfig, surfaceAttr);
|
||||
|
||||
// Bind context, draw and surface to current thread
|
||||
eglMakeCurrent(display, surface, surface, context);
|
||||
|
||||
// Bind the API for this context. In our case we want to use OpenGL_ES
|
||||
eglBindAPI(EGL_OPENGL_ES_API);
|
||||
return context;
|
||||
}
|
||||
|
||||
bool AndroidGLContext::opengl_thread_context_exists() {
|
||||
return eglGetCurrentContext() != EGL_NO_CONTEXT;
|
||||
}
|
||||
|
||||
bool AndroidGLContext::release_opengl_thread_context() {
|
||||
EGLContext display = eglGetCurrentDisplay();
|
||||
if (display != EGL_NO_DISPLAY) {
|
||||
if (_eglcontext != EGL_NO_CONTEXT) {
|
||||
eglDestroyContext(display, _eglcontext);
|
||||
_eglcontext = EGL_NO_CONTEXT;
|
||||
}
|
||||
EGLSurface surface = eglGetCurrentSurface(EGL_DRAW);
|
||||
if (surface != EGL_NO_SURFACE) {
|
||||
eglDestroySurface(display, surface);
|
||||
surface = EGL_NO_SURFACE;
|
||||
}
|
||||
surface = eglGetCurrentSurface(EGL_READ);
|
||||
if (surface != EGL_NO_SURFACE) {
|
||||
eglDestroySurface(display, surface);
|
||||
surface = EGL_NO_SURFACE;
|
||||
}
|
||||
eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
|
||||
eglTerminate(display);
|
||||
display = EGL_NO_DISPLAY;
|
||||
}
|
||||
eglReleaseThread();
|
||||
return true;
|
||||
}
|
||||
|
||||
void AndroidGLContext::init_gles3() {
|
||||
if (!gl3stubInit()) {
|
||||
CAFFE_THROW("OpenGL ES 3 not initialized");
|
||||
} else {
|
||||
LOG(INFO) << "OpenGL ES 3 successfully enabled";
|
||||
}
|
||||
}
|
||||
|
||||
GL_Renderer AndroidGLContext::get_platform() {
|
||||
std::string rendererStr((const char*)glGetString(GL_RENDERER));
|
||||
std::regex regStr("^[A-Za-z]*");
|
||||
std::smatch matchs;
|
||||
if (std::regex_search(rendererStr, matchs, regStr)) {
|
||||
const std::string renderer = *matchs.begin();
|
||||
auto found = renderer_map().find(renderer);
|
||||
if (found != renderer_map().end()) {
|
||||
return found->second;
|
||||
}
|
||||
}
|
||||
CAFFE_THROW("Unsupported GPU renderer");
|
||||
}
|
||||
|
||||
AndroidGLContext::AndroidGLContext() {
|
||||
if (!opengl_thread_context_exists()) {
|
||||
_eglcontext = create_opengl_thread_context();
|
||||
LOG(INFO) << "New EGLContext created";
|
||||
|
||||
if (!supportOpenGLES3(&half_float_supported)) {
|
||||
CAFFE_THROW("OpenGL ES 3 not supported");
|
||||
}
|
||||
|
||||
if (!isSupportedDevice()) {
|
||||
LOG(ERROR) << "Device not fully supported";
|
||||
}
|
||||
} else {
|
||||
_eglcontext = EGL_NO_CONTEXT;
|
||||
LOG(INFO) << "Reusing EGLContext, make sure OpenGL ES 3 is supported";
|
||||
}
|
||||
static std::once_flag once;
|
||||
std::call_once(once, [&]() { init_gles3(); });
|
||||
}
|
||||
|
||||
AndroidGLContext::~AndroidGLContext() {
|
||||
if (_eglcontext != EGL_NO_CONTEXT) {
|
||||
release_opengl_thread_context();
|
||||
}
|
||||
}
|
||||
|
||||
void AndroidGLContext::set_context() {}
|
||||
|
||||
void AndroidGLContext::reset_context() {}
|
||||
|
||||
void AndroidGLContext::flush_context() {}
|
@ -1,26 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "../core/GLContext.h"
|
||||
#include "../core/GLTexture.h"
|
||||
#include <unordered_map>
|
||||
|
||||
enum GL_Renderer { Adreno, Mali, Tegra /*, PowerVR */ };
|
||||
|
||||
class AndroidGLContext : public GLContext {
|
||||
private:
|
||||
EGLContext _eglcontext;
|
||||
|
||||
EGLContext create_opengl_thread_context();
|
||||
bool opengl_thread_context_exists();
|
||||
bool release_opengl_thread_context();
|
||||
|
||||
public:
|
||||
AndroidGLContext();
|
||||
~AndroidGLContext();
|
||||
void set_context();
|
||||
void reset_context();
|
||||
void flush_context();
|
||||
void init_gles3();
|
||||
GL_Renderer get_platform();
|
||||
};
|
@ -1,2 +0,0 @@
|
||||
file(GLOB_RECURSE tmp *.cc *.c)
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
|
@ -1,19 +0,0 @@
|
||||
|
||||
#include "AndroidGLContext.h"
|
||||
|
||||
std::unique_ptr<GLContext> GLContext::_glcontext = nullptr;
|
||||
|
||||
void GLContext::initGLContext() {
|
||||
if (_glcontext == nullptr) {
|
||||
_glcontext.reset(new AndroidGLContext());
|
||||
}
|
||||
}
|
||||
|
||||
GLContext* GLContext::getGLContext() {
|
||||
if (_glcontext == nullptr) {
|
||||
initGLContext();
|
||||
}
|
||||
return _glcontext.get();
|
||||
}
|
||||
|
||||
void GLContext::deleteGLContext() { _glcontext.reset(nullptr); }
|
@ -1,11 +0,0 @@
|
||||
|
||||
#include "../core/GLImageAllocator.h"
|
||||
#include "../core/arm_neon_support.h"
|
||||
|
||||
template <typename T>
|
||||
GLImageAllocator<T>* GLImageAllocator<T>::newGLImageAllocator() {
|
||||
return new GLImageAllocator<T>();
|
||||
}
|
||||
|
||||
template GLImageAllocator<float16_t>* GLImageAllocator<float16_t>::newGLImageAllocator();
|
||||
template GLImageAllocator<uint8_t>* GLImageAllocator<uint8_t>::newGLImageAllocator();
|
@ -1,5 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <arm_neon.h>
|
||||
typedef __fp16 float16_t;
|
@ -1,357 +0,0 @@
|
||||
|
||||
// clang-format off
|
||||
|
||||
#include <EGL/egl.h>
|
||||
#include "gl3stub.h"
|
||||
|
||||
GLboolean gl3stubInit() {
|
||||
#define FIND_PROC(s) s = (void*)eglGetProcAddress(#s)
|
||||
FIND_PROC(glReadBuffer);
|
||||
FIND_PROC(glDrawRangeElements);
|
||||
FIND_PROC(glTexImage3D);
|
||||
FIND_PROC(glTexSubImage3D);
|
||||
FIND_PROC(glCopyTexSubImage3D);
|
||||
FIND_PROC(glCompressedTexImage3D);
|
||||
FIND_PROC(glCompressedTexSubImage3D);
|
||||
FIND_PROC(glGenQueries);
|
||||
FIND_PROC(glDeleteQueries);
|
||||
FIND_PROC(glIsQuery);
|
||||
FIND_PROC(glBeginQuery);
|
||||
FIND_PROC(glEndQuery);
|
||||
FIND_PROC(glGetQueryiv);
|
||||
FIND_PROC(glGetQueryObjectuiv);
|
||||
FIND_PROC(glUnmapBuffer);
|
||||
FIND_PROC(glGetBufferPointerv);
|
||||
FIND_PROC(glDrawBuffers);
|
||||
FIND_PROC(glUniformMatrix2x3fv);
|
||||
FIND_PROC(glUniformMatrix3x2fv);
|
||||
FIND_PROC(glUniformMatrix2x4fv);
|
||||
FIND_PROC(glUniformMatrix4x2fv);
|
||||
FIND_PROC(glUniformMatrix3x4fv);
|
||||
FIND_PROC(glUniformMatrix4x3fv);
|
||||
FIND_PROC(glBlitFramebuffer);
|
||||
FIND_PROC(glRenderbufferStorageMultisample);
|
||||
FIND_PROC(glFramebufferTextureLayer);
|
||||
FIND_PROC(glMapBufferRange);
|
||||
FIND_PROC(glFlushMappedBufferRange);
|
||||
FIND_PROC(glBindVertexArray);
|
||||
FIND_PROC(glDeleteVertexArrays);
|
||||
FIND_PROC(glGenVertexArrays);
|
||||
FIND_PROC(glIsVertexArray);
|
||||
FIND_PROC(glGetIntegeri_v);
|
||||
FIND_PROC(glBeginTransformFeedback);
|
||||
FIND_PROC(glEndTransformFeedback);
|
||||
FIND_PROC(glBindBufferRange);
|
||||
FIND_PROC(glBindBufferBase);
|
||||
FIND_PROC(glTransformFeedbackVaryings);
|
||||
FIND_PROC(glGetTransformFeedbackVarying);
|
||||
FIND_PROC(glVertexAttribIPointer);
|
||||
FIND_PROC(glGetVertexAttribIiv);
|
||||
FIND_PROC(glGetVertexAttribIuiv);
|
||||
FIND_PROC(glVertexAttribI4i);
|
||||
FIND_PROC(glVertexAttribI4ui);
|
||||
FIND_PROC(glVertexAttribI4iv);
|
||||
FIND_PROC(glVertexAttribI4uiv);
|
||||
FIND_PROC(glGetUniformuiv);
|
||||
FIND_PROC(glGetFragDataLocation);
|
||||
FIND_PROC(glUniform1ui);
|
||||
FIND_PROC(glUniform2ui);
|
||||
FIND_PROC(glUniform3ui);
|
||||
FIND_PROC(glUniform4ui);
|
||||
FIND_PROC(glUniform1uiv);
|
||||
FIND_PROC(glUniform2uiv);
|
||||
FIND_PROC(glUniform3uiv);
|
||||
FIND_PROC(glUniform4uiv);
|
||||
FIND_PROC(glClearBufferiv);
|
||||
FIND_PROC(glClearBufferuiv);
|
||||
FIND_PROC(glClearBufferfv);
|
||||
FIND_PROC(glClearBufferfi);
|
||||
FIND_PROC(glGetStringi);
|
||||
FIND_PROC(glCopyBufferSubData);
|
||||
FIND_PROC(glGetUniformIndices);
|
||||
FIND_PROC(glGetActiveUniformsiv);
|
||||
FIND_PROC(glGetUniformBlockIndex);
|
||||
FIND_PROC(glGetActiveUniformBlockiv);
|
||||
FIND_PROC(glGetActiveUniformBlockName);
|
||||
FIND_PROC(glUniformBlockBinding);
|
||||
FIND_PROC(glDrawArraysInstanced);
|
||||
FIND_PROC(glDrawElementsInstanced);
|
||||
FIND_PROC(glFenceSync);
|
||||
FIND_PROC(glIsSync);
|
||||
FIND_PROC(glDeleteSync);
|
||||
FIND_PROC(glClientWaitSync);
|
||||
FIND_PROC(glWaitSync);
|
||||
FIND_PROC(glGetInteger64v);
|
||||
FIND_PROC(glGetSynciv);
|
||||
FIND_PROC(glGetInteger64i_v);
|
||||
FIND_PROC(glGetBufferParameteri64v);
|
||||
FIND_PROC(glGenSamplers);
|
||||
FIND_PROC(glDeleteSamplers);
|
||||
FIND_PROC(glIsSampler);
|
||||
FIND_PROC(glBindSampler);
|
||||
FIND_PROC(glSamplerParameteri);
|
||||
FIND_PROC(glSamplerParameteriv);
|
||||
FIND_PROC(glSamplerParameterf);
|
||||
FIND_PROC(glSamplerParameterfv);
|
||||
FIND_PROC(glGetSamplerParameteriv);
|
||||
FIND_PROC(glGetSamplerParameterfv);
|
||||
FIND_PROC(glVertexAttribDivisor);
|
||||
FIND_PROC(glBindTransformFeedback);
|
||||
FIND_PROC(glDeleteTransformFeedbacks);
|
||||
FIND_PROC(glGenTransformFeedbacks);
|
||||
FIND_PROC(glIsTransformFeedback);
|
||||
FIND_PROC(glPauseTransformFeedback);
|
||||
FIND_PROC(glResumeTransformFeedback);
|
||||
FIND_PROC(glGetProgramBinary);
|
||||
FIND_PROC(glProgramBinary);
|
||||
FIND_PROC(glProgramParameteri);
|
||||
FIND_PROC(glInvalidateFramebuffer);
|
||||
FIND_PROC(glInvalidateSubFramebuffer);
|
||||
FIND_PROC(glTexStorage2D);
|
||||
FIND_PROC(glTexStorage3D);
|
||||
FIND_PROC(glGetInternalformativ);
|
||||
|
||||
// Bind GL_EXT_texture_border_clamp
|
||||
|
||||
FIND_PROC(glTexParameterIivEXT);
|
||||
FIND_PROC(glTexParameterIuivEXT);
|
||||
FIND_PROC(glGetTexParameterIivEXT);
|
||||
FIND_PROC(glGetTexParameterIuivEXT);
|
||||
FIND_PROC(glSamplerParameterIivEXT);
|
||||
FIND_PROC(glSamplerParameterIuivEXT);
|
||||
FIND_PROC(glGetSamplerParameterIivEXT);
|
||||
FIND_PROC(glGetSamplerParameterIuivEXT);
|
||||
|
||||
#undef FIND_PROC
|
||||
|
||||
if (!glReadBuffer ||
|
||||
!glDrawRangeElements ||
|
||||
!glTexImage3D ||
|
||||
!glTexSubImage3D ||
|
||||
!glCopyTexSubImage3D ||
|
||||
!glCompressedTexImage3D ||
|
||||
!glCompressedTexSubImage3D ||
|
||||
!glGenQueries ||
|
||||
!glDeleteQueries ||
|
||||
!glIsQuery ||
|
||||
!glBeginQuery ||
|
||||
!glEndQuery ||
|
||||
!glGetQueryiv ||
|
||||
!glGetQueryObjectuiv ||
|
||||
!glUnmapBuffer ||
|
||||
!glGetBufferPointerv ||
|
||||
!glDrawBuffers ||
|
||||
!glUniformMatrix2x3fv ||
|
||||
!glUniformMatrix3x2fv ||
|
||||
!glUniformMatrix2x4fv ||
|
||||
!glUniformMatrix4x2fv ||
|
||||
!glUniformMatrix3x4fv ||
|
||||
!glUniformMatrix4x3fv ||
|
||||
!glBlitFramebuffer ||
|
||||
!glRenderbufferStorageMultisample ||
|
||||
!glFramebufferTextureLayer ||
|
||||
!glMapBufferRange ||
|
||||
!glFlushMappedBufferRange ||
|
||||
!glBindVertexArray ||
|
||||
!glDeleteVertexArrays ||
|
||||
!glGenVertexArrays ||
|
||||
!glIsVertexArray ||
|
||||
!glGetIntegeri_v ||
|
||||
!glBeginTransformFeedback ||
|
||||
!glEndTransformFeedback ||
|
||||
!glBindBufferRange ||
|
||||
!glBindBufferBase ||
|
||||
!glTransformFeedbackVaryings ||
|
||||
!glGetTransformFeedbackVarying ||
|
||||
!glVertexAttribIPointer ||
|
||||
!glGetVertexAttribIiv ||
|
||||
!glGetVertexAttribIuiv ||
|
||||
!glVertexAttribI4i ||
|
||||
!glVertexAttribI4ui ||
|
||||
!glVertexAttribI4iv ||
|
||||
!glVertexAttribI4uiv ||
|
||||
!glGetUniformuiv ||
|
||||
!glGetFragDataLocation ||
|
||||
!glUniform1ui ||
|
||||
!glUniform2ui ||
|
||||
!glUniform3ui ||
|
||||
!glUniform4ui ||
|
||||
!glUniform1uiv ||
|
||||
!glUniform2uiv ||
|
||||
!glUniform3uiv ||
|
||||
!glUniform4uiv ||
|
||||
!glClearBufferiv ||
|
||||
!glClearBufferuiv ||
|
||||
!glClearBufferfv ||
|
||||
!glClearBufferfi ||
|
||||
!glGetStringi ||
|
||||
!glCopyBufferSubData ||
|
||||
!glGetUniformIndices ||
|
||||
!glGetActiveUniformsiv ||
|
||||
!glGetUniformBlockIndex ||
|
||||
!glGetActiveUniformBlockiv ||
|
||||
!glGetActiveUniformBlockName ||
|
||||
!glUniformBlockBinding ||
|
||||
!glDrawArraysInstanced ||
|
||||
!glDrawElementsInstanced ||
|
||||
!glFenceSync ||
|
||||
!glIsSync ||
|
||||
!glDeleteSync ||
|
||||
!glClientWaitSync ||
|
||||
!glWaitSync ||
|
||||
!glGetInteger64v ||
|
||||
!glGetSynciv ||
|
||||
!glGetInteger64i_v ||
|
||||
!glGetBufferParameteri64v ||
|
||||
!glGenSamplers ||
|
||||
!glDeleteSamplers ||
|
||||
!glIsSampler ||
|
||||
!glBindSampler ||
|
||||
!glSamplerParameteri ||
|
||||
!glSamplerParameteriv ||
|
||||
!glSamplerParameterf ||
|
||||
!glSamplerParameterfv ||
|
||||
!glGetSamplerParameteriv ||
|
||||
!glGetSamplerParameterfv ||
|
||||
!glVertexAttribDivisor ||
|
||||
!glBindTransformFeedback ||
|
||||
!glDeleteTransformFeedbacks ||
|
||||
!glGenTransformFeedbacks ||
|
||||
!glIsTransformFeedback ||
|
||||
!glPauseTransformFeedback ||
|
||||
!glResumeTransformFeedback ||
|
||||
!glGetProgramBinary ||
|
||||
!glProgramBinary ||
|
||||
!glProgramParameteri ||
|
||||
!glInvalidateFramebuffer ||
|
||||
!glInvalidateSubFramebuffer ||
|
||||
!glTexStorage2D ||
|
||||
!glTexStorage3D ||
|
||||
!glGetInternalformativ)
|
||||
{
|
||||
return GL_FALSE;
|
||||
}
|
||||
|
||||
return GL_TRUE;
|
||||
}
|
||||
|
||||
/* Function pointer definitions */
|
||||
GL_APICALL void (* GL_APIENTRY glReadBuffer) (GLenum mode);
|
||||
GL_APICALL void (* GL_APIENTRY glDrawRangeElements) (GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const GLvoid* indices);
|
||||
GL_APICALL void (* GL_APIENTRY glTexImage3D) (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const GLvoid* pixels);
|
||||
GL_APICALL void (* GL_APIENTRY glTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const GLvoid* pixels);
|
||||
GL_APICALL void (* GL_APIENTRY glCopyTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height);
|
||||
GL_APICALL void (* GL_APIENTRY glCompressedTexImage3D) (GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const GLvoid* data);
|
||||
GL_APICALL void (* GL_APIENTRY glCompressedTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const GLvoid* data);
|
||||
GL_APICALL void (* GL_APIENTRY glGenQueries) (GLsizei n, GLuint* ids);
|
||||
GL_APICALL void (* GL_APIENTRY glDeleteQueries) (GLsizei n, const GLuint* ids);
|
||||
GL_APICALL GLboolean (* GL_APIENTRY glIsQuery) (GLuint id);
|
||||
GL_APICALL void (* GL_APIENTRY glBeginQuery) (GLenum target, GLuint id);
|
||||
GL_APICALL void (* GL_APIENTRY glEndQuery) (GLenum target);
|
||||
GL_APICALL void (* GL_APIENTRY glGetQueryiv) (GLenum target, GLenum pname, GLint* params);
|
||||
GL_APICALL void (* GL_APIENTRY glGetQueryObjectuiv) (GLuint id, GLenum pname, GLuint* params);
|
||||
GL_APICALL GLboolean (* GL_APIENTRY glUnmapBuffer) (GLenum target);
|
||||
GL_APICALL void (* GL_APIENTRY glGetBufferPointerv) (GLenum target, GLenum pname, GLvoid** params);
|
||||
GL_APICALL void (* GL_APIENTRY glDrawBuffers) (GLsizei n, const GLenum* bufs);
|
||||
GL_APICALL void (* GL_APIENTRY glUniformMatrix2x3fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
|
||||
GL_APICALL void (* GL_APIENTRY glUniformMatrix3x2fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
|
||||
GL_APICALL void (* GL_APIENTRY glUniformMatrix2x4fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
|
||||
GL_APICALL void (* GL_APIENTRY glUniformMatrix4x2fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
|
||||
GL_APICALL void (* GL_APIENTRY glUniformMatrix3x4fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
|
||||
GL_APICALL void (* GL_APIENTRY glUniformMatrix4x3fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
|
||||
GL_APICALL void (* GL_APIENTRY glBlitFramebuffer) (GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter);
|
||||
GL_APICALL void (* GL_APIENTRY glRenderbufferStorageMultisample) (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
|
||||
GL_APICALL void (* GL_APIENTRY glFramebufferTextureLayer) (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint layer);
|
||||
GL_APICALL GLvoid* (* GL_APIENTRY glMapBufferRange) (GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
|
||||
GL_APICALL void (* GL_APIENTRY glFlushMappedBufferRange) (GLenum target, GLintptr offset, GLsizeiptr length);
|
||||
GL_APICALL void (* GL_APIENTRY glBindVertexArray) (GLuint array);
|
||||
GL_APICALL void (* GL_APIENTRY glDeleteVertexArrays) (GLsizei n, const GLuint* arrays);
|
||||
GL_APICALL void (* GL_APIENTRY glGenVertexArrays) (GLsizei n, GLuint* arrays);
|
||||
GL_APICALL GLboolean (* GL_APIENTRY glIsVertexArray) (GLuint array);
|
||||
GL_APICALL void (* GL_APIENTRY glGetIntegeri_v) (GLenum target, GLuint index, GLint* data);
|
||||
GL_APICALL void (* GL_APIENTRY glBeginTransformFeedback) (GLenum primitiveMode);
|
||||
GL_APICALL void (* GL_APIENTRY glEndTransformFeedback) (void);
|
||||
GL_APICALL void (* GL_APIENTRY glBindBufferRange) (GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size);
|
||||
GL_APICALL void (* GL_APIENTRY glBindBufferBase) (GLenum target, GLuint index, GLuint buffer);
|
||||
GL_APICALL void (* GL_APIENTRY glTransformFeedbackVaryings) (GLuint program, GLsizei count, const GLchar* const* varyings, GLenum bufferMode);
|
||||
GL_APICALL void (* GL_APIENTRY glGetTransformFeedbackVarying) (GLuint program, GLuint index, GLsizei bufSize, GLsizei* length, GLsizei* size, GLenum* type, GLchar* name);
|
||||
GL_APICALL void (* GL_APIENTRY glVertexAttribIPointer) (GLuint index, GLint size, GLenum type, GLsizei stride, const GLvoid* pointer);
|
||||
GL_APICALL void (* GL_APIENTRY glGetVertexAttribIiv) (GLuint index, GLenum pname, GLint* params);
|
||||
GL_APICALL void (* GL_APIENTRY glGetVertexAttribIuiv) (GLuint index, GLenum pname, GLuint* params);
|
||||
GL_APICALL void (* GL_APIENTRY glVertexAttribI4i) (GLuint index, GLint x, GLint y, GLint z, GLint w);
|
||||
GL_APICALL void (* GL_APIENTRY glVertexAttribI4ui) (GLuint index, GLuint x, GLuint y, GLuint z, GLuint w);
|
||||
GL_APICALL void (* GL_APIENTRY glVertexAttribI4iv) (GLuint index, const GLint* v);
|
||||
GL_APICALL void (* GL_APIENTRY glVertexAttribI4uiv) (GLuint index, const GLuint* v);
|
||||
GL_APICALL void (* GL_APIENTRY glGetUniformuiv) (GLuint program, GLint location, GLuint* params);
|
||||
GL_APICALL GLint (* GL_APIENTRY glGetFragDataLocation) (GLuint program, const GLchar *name);
|
||||
GL_APICALL void (* GL_APIENTRY glUniform1ui) (GLint location, GLuint v0);
|
||||
GL_APICALL void (* GL_APIENTRY glUniform2ui) (GLint location, GLuint v0, GLuint v1);
|
||||
GL_APICALL void (* GL_APIENTRY glUniform3ui) (GLint location, GLuint v0, GLuint v1, GLuint v2);
|
||||
GL_APICALL void (* GL_APIENTRY glUniform4ui) (GLint location, GLuint v0, GLuint v1, GLuint v2, GLuint v3);
|
||||
GL_APICALL void (* GL_APIENTRY glUniform1uiv) (GLint location, GLsizei count, const GLuint* value);
|
||||
GL_APICALL void (* GL_APIENTRY glUniform2uiv) (GLint location, GLsizei count, const GLuint* value);
|
||||
GL_APICALL void (* GL_APIENTRY glUniform3uiv) (GLint location, GLsizei count, const GLuint* value);
|
||||
GL_APICALL void (* GL_APIENTRY glUniform4uiv) (GLint location, GLsizei count, const GLuint* value);
|
||||
GL_APICALL void (* GL_APIENTRY glClearBufferiv) (GLenum buffer, GLint drawbuffer, const GLint* value);
|
||||
GL_APICALL void (* GL_APIENTRY glClearBufferuiv) (GLenum buffer, GLint drawbuffer, const GLuint* value);
|
||||
GL_APICALL void (* GL_APIENTRY glClearBufferfv) (GLenum buffer, GLint drawbuffer, const GLfloat* value);
|
||||
GL_APICALL void (* GL_APIENTRY glClearBufferfi) (GLenum buffer, GLint drawbuffer, GLfloat depth, GLint stencil);
|
||||
GL_APICALL const GLubyte* (* GL_APIENTRY glGetStringi) (GLenum name, GLuint index);
|
||||
GL_APICALL void (* GL_APIENTRY glCopyBufferSubData) (GLenum readTarget, GLenum writeTarget, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size);
|
||||
GL_APICALL void (* GL_APIENTRY glGetUniformIndices) (GLuint program, GLsizei uniformCount, const GLchar* const* uniformNames, GLuint* uniformIndices);
|
||||
GL_APICALL void (* GL_APIENTRY glGetActiveUniformsiv) (GLuint program, GLsizei uniformCount, const GLuint* uniformIndices, GLenum pname, GLint* params);
|
||||
GL_APICALL GLuint (* GL_APIENTRY glGetUniformBlockIndex) (GLuint program, const GLchar* uniformBlockName);
|
||||
GL_APICALL void (* GL_APIENTRY glGetActiveUniformBlockiv) (GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint* params);
|
||||
GL_APICALL void (* GL_APIENTRY glGetActiveUniformBlockName) (GLuint program, GLuint uniformBlockIndex, GLsizei bufSize, GLsizei* length, GLchar* uniformBlockName);
|
||||
GL_APICALL void (* GL_APIENTRY glUniformBlockBinding) (GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding);
|
||||
GL_APICALL void (* GL_APIENTRY glDrawArraysInstanced) (GLenum mode, GLint first, GLsizei count, GLsizei instanceCount);
|
||||
GL_APICALL void (* GL_APIENTRY glDrawElementsInstanced) (GLenum mode, GLsizei count, GLenum type, const GLvoid* indices, GLsizei instanceCount);
|
||||
GL_APICALL GLsync (* GL_APIENTRY glFenceSync) (GLenum condition, GLbitfield flags);
|
||||
GL_APICALL GLboolean (* GL_APIENTRY glIsSync) (GLsync sync);
|
||||
GL_APICALL void (* GL_APIENTRY glDeleteSync) (GLsync sync);
|
||||
GL_APICALL GLenum (* GL_APIENTRY glClientWaitSync) (GLsync sync, GLbitfield flags, GLuint64 timeout);
|
||||
GL_APICALL void (* GL_APIENTRY glWaitSync) (GLsync sync, GLbitfield flags, GLuint64 timeout);
|
||||
GL_APICALL void (* GL_APIENTRY glGetInteger64v) (GLenum pname, GLint64* params);
|
||||
GL_APICALL void (* GL_APIENTRY glGetSynciv) (GLsync sync, GLenum pname, GLsizei bufSize, GLsizei* length, GLint* values);
|
||||
GL_APICALL void (* GL_APIENTRY glGetInteger64i_v) (GLenum target, GLuint index, GLint64* data);
|
||||
GL_APICALL void (* GL_APIENTRY glGetBufferParameteri64v) (GLenum target, GLenum pname, GLint64* params);
|
||||
GL_APICALL void (* GL_APIENTRY glGenSamplers) (GLsizei count, GLuint* samplers);
|
||||
GL_APICALL void (* GL_APIENTRY glDeleteSamplers) (GLsizei count, const GLuint* samplers);
|
||||
GL_APICALL GLboolean (* GL_APIENTRY glIsSampler) (GLuint sampler);
|
||||
GL_APICALL void (* GL_APIENTRY glBindSampler) (GLuint unit, GLuint sampler);
|
||||
GL_APICALL void (* GL_APIENTRY glSamplerParameteri) (GLuint sampler, GLenum pname, GLint param);
|
||||
GL_APICALL void (* GL_APIENTRY glSamplerParameteriv) (GLuint sampler, GLenum pname, const GLint* param);
|
||||
GL_APICALL void (* GL_APIENTRY glSamplerParameterf) (GLuint sampler, GLenum pname, GLfloat param);
|
||||
GL_APICALL void (* GL_APIENTRY glSamplerParameterfv) (GLuint sampler, GLenum pname, const GLfloat* param);
|
||||
GL_APICALL void (* GL_APIENTRY glGetSamplerParameteriv) (GLuint sampler, GLenum pname, GLint* params);
|
||||
GL_APICALL void (* GL_APIENTRY glGetSamplerParameterfv) (GLuint sampler, GLenum pname, GLfloat* params);
|
||||
GL_APICALL void (* GL_APIENTRY glVertexAttribDivisor) (GLuint index, GLuint divisor);
|
||||
GL_APICALL void (* GL_APIENTRY glBindTransformFeedback) (GLenum target, GLuint id);
|
||||
GL_APICALL void (* GL_APIENTRY glDeleteTransformFeedbacks) (GLsizei n, const GLuint* ids);
|
||||
GL_APICALL void (* GL_APIENTRY glGenTransformFeedbacks) (GLsizei n, GLuint* ids);
|
||||
GL_APICALL GLboolean (* GL_APIENTRY glIsTransformFeedback) (GLuint id);
|
||||
GL_APICALL void (* GL_APIENTRY glPauseTransformFeedback) (void);
|
||||
GL_APICALL void (* GL_APIENTRY glResumeTransformFeedback) (void);
|
||||
GL_APICALL void (* GL_APIENTRY glGetProgramBinary) (GLuint program, GLsizei bufSize, GLsizei* length, GLenum* binaryFormat, GLvoid* binary);
|
||||
GL_APICALL void (* GL_APIENTRY glProgramBinary) (GLuint program, GLenum binaryFormat, const GLvoid* binary, GLsizei length);
|
||||
GL_APICALL void (* GL_APIENTRY glProgramParameteri) (GLuint program, GLenum pname, GLint value);
|
||||
GL_APICALL void (* GL_APIENTRY glInvalidateFramebuffer) (GLenum target, GLsizei numAttachments, const GLenum* attachments);
|
||||
GL_APICALL void (* GL_APIENTRY glInvalidateSubFramebuffer) (GLenum target, GLsizei numAttachments, const GLenum* attachments, GLint x, GLint y, GLsizei width, GLsizei height);
|
||||
GL_APICALL void (* GL_APIENTRY glTexStorage2D) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
|
||||
GL_APICALL void (* GL_APIENTRY glTexStorage3D) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
|
||||
GL_APICALL void (* GL_APIENTRY glGetInternalformativ) (GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint* params);
|
||||
|
||||
// GL_EXT_texture_border_clamp
|
||||
|
||||
GL_APICALL void (* GL_APIENTRY glTexParameterIivEXT) (GLenum target, GLenum pname, const GLint *params);
|
||||
GL_APICALL void (* GL_APIENTRY glTexParameterIuivEXT) (GLenum target, GLenum pname, const GLuint *params);
|
||||
GL_APICALL void (* GL_APIENTRY glGetTexParameterIivEXT) (GLenum target, GLenum pname, GLint *params);
|
||||
GL_APICALL void (* GL_APIENTRY glGetTexParameterIuivEXT) (GLenum target, GLenum pname, GLuint *params);
|
||||
GL_APICALL void (* GL_APIENTRY glSamplerParameterIivEXT) (GLuint sampler, GLenum pname, const GLint *param);
|
||||
GL_APICALL void (* GL_APIENTRY glSamplerParameterIuivEXT) (GLuint sampler, GLenum pname, const GLuint *param);
|
||||
GL_APICALL void (* GL_APIENTRY glGetSamplerParameterIivEXT) (GLuint sampler, GLenum pname, GLint *params);
|
||||
GL_APICALL void (* GL_APIENTRY glGetSamplerParameterIuivEXT) (GLuint sampler, GLenum pname, GLuint *params);
|
||||
|
||||
// End GL_EXT_texture_border_clamp
|
||||
|
||||
// clang-format on
|
@ -1,488 +0,0 @@
|
||||
|
||||
#ifndef __gl3_h_
|
||||
#define __gl3_h_
|
||||
|
||||
/*
|
||||
* stub gl3.h for dynamic loading, based on:
|
||||
* gl3.h last updated on $Date: 2013-02-12 14:37:24 -0800 (Tue, 12 Feb 2013) $
|
||||
*
|
||||
* Changes:
|
||||
* - Added #include <GLES2/gl2.h>
|
||||
* - Removed duplicate OpenGL ES 2.0 declarations
|
||||
* - Converted OpenGL ES 3.0 function prototypes to function pointer
|
||||
* declarations
|
||||
* - Added gl3stubInit() declaration
|
||||
*/
|
||||
|
||||
#include <GLES2/gl2.h>
|
||||
#include <android/api-level.h>
|
||||
|
||||
// clang-format off
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Call this function before calling any OpenGL ES 3.0 functions. It will
|
||||
* return GL_TRUE if the OpenGL ES 3.0 was successfully initialized, GL_FALSE
|
||||
* otherwise. */
|
||||
GLboolean gl3stubInit();
|
||||
|
||||
/*-------------------------------------------------------------------------
|
||||
* Data type definitions
|
||||
*-----------------------------------------------------------------------*/
|
||||
|
||||
/* OpenGL ES 3.0 */
|
||||
|
||||
typedef unsigned short GLhalf;
|
||||
#if __ANDROID_API__ <= 19
|
||||
typedef khronos_int64_t GLint64;
|
||||
typedef khronos_uint64_t GLuint64;
|
||||
typedef struct __GLsync *GLsync;
|
||||
#endif
|
||||
|
||||
/*-------------------------------------------------------------------------
|
||||
* Token definitions
|
||||
*-----------------------------------------------------------------------*/
|
||||
|
||||
/* OpenGL ES core versions */
|
||||
#define GL_ES_VERSION_3_0 1
|
||||
|
||||
/* OpenGL ES 3.0 */
|
||||
|
||||
#define GL_READ_BUFFER 0x0C02
|
||||
#define GL_UNPACK_ROW_LENGTH 0x0CF2
|
||||
#define GL_UNPACK_SKIP_ROWS 0x0CF3
|
||||
#define GL_UNPACK_SKIP_PIXELS 0x0CF4
|
||||
#define GL_PACK_ROW_LENGTH 0x0D02
|
||||
#define GL_PACK_SKIP_ROWS 0x0D03
|
||||
#define GL_PACK_SKIP_PIXELS 0x0D04
|
||||
#define GL_COLOR 0x1800
|
||||
#define GL_DEPTH 0x1801
|
||||
#define GL_STENCIL 0x1802
|
||||
#define GL_RED 0x1903
|
||||
#define GL_RGB8 0x8051
|
||||
#define GL_RGBA8 0x8058
|
||||
#define GL_RGB10_A2 0x8059
|
||||
#define GL_TEXTURE_BINDING_3D 0x806A
|
||||
#define GL_UNPACK_SKIP_IMAGES 0x806D
|
||||
#define GL_UNPACK_IMAGE_HEIGHT 0x806E
|
||||
#define GL_TEXTURE_3D 0x806F
|
||||
#define GL_TEXTURE_WRAP_R 0x8072
|
||||
#define GL_MAX_3D_TEXTURE_SIZE 0x8073
|
||||
#define GL_UNSIGNED_INT_2_10_10_10_REV 0x8368
|
||||
#define GL_MAX_ELEMENTS_VERTICES 0x80E8
|
||||
#define GL_MAX_ELEMENTS_INDICES 0x80E9
|
||||
#define GL_TEXTURE_MIN_LOD 0x813A
|
||||
#define GL_TEXTURE_MAX_LOD 0x813B
|
||||
#define GL_TEXTURE_BASE_LEVEL 0x813C
|
||||
#define GL_TEXTURE_MAX_LEVEL 0x813D
|
||||
#define GL_MIN 0x8007
|
||||
#define GL_MAX 0x8008
|
||||
#define GL_DEPTH_COMPONENT24 0x81A6
|
||||
#define GL_MAX_TEXTURE_LOD_BIAS 0x84FD
|
||||
#define GL_TEXTURE_COMPARE_MODE 0x884C
|
||||
#define GL_TEXTURE_COMPARE_FUNC 0x884D
|
||||
#define GL_CURRENT_QUERY 0x8865
|
||||
#define GL_QUERY_RESULT 0x8866
|
||||
#define GL_QUERY_RESULT_AVAILABLE 0x8867
|
||||
#define GL_BUFFER_MAPPED 0x88BC
|
||||
#define GL_BUFFER_MAP_POINTER 0x88BD
|
||||
#define GL_STREAM_READ 0x88E1
|
||||
#define GL_STREAM_COPY 0x88E2
|
||||
#define GL_STATIC_READ 0x88E5
|
||||
#define GL_STATIC_COPY 0x88E6
|
||||
#define GL_DYNAMIC_READ 0x88E9
|
||||
#define GL_DYNAMIC_COPY 0x88EA
|
||||
#define GL_MAX_DRAW_BUFFERS 0x8824
|
||||
#define GL_DRAW_BUFFER0 0x8825
|
||||
#define GL_DRAW_BUFFER1 0x8826
|
||||
#define GL_DRAW_BUFFER2 0x8827
|
||||
#define GL_DRAW_BUFFER3 0x8828
|
||||
#define GL_DRAW_BUFFER4 0x8829
|
||||
#define GL_DRAW_BUFFER5 0x882A
|
||||
#define GL_DRAW_BUFFER6 0x882B
|
||||
#define GL_DRAW_BUFFER7 0x882C
|
||||
#define GL_DRAW_BUFFER8 0x882D
|
||||
#define GL_DRAW_BUFFER9 0x882E
|
||||
#define GL_DRAW_BUFFER10 0x882F
|
||||
#define GL_DRAW_BUFFER11 0x8830
|
||||
#define GL_DRAW_BUFFER12 0x8831
|
||||
#define GL_DRAW_BUFFER13 0x8832
|
||||
#define GL_DRAW_BUFFER14 0x8833
|
||||
#define GL_DRAW_BUFFER15 0x8834
|
||||
#define GL_MAX_FRAGMENT_UNIFORM_COMPONENTS 0x8B49
|
||||
#define GL_MAX_VERTEX_UNIFORM_COMPONENTS 0x8B4A
|
||||
#define GL_SAMPLER_3D 0x8B5F
|
||||
#define GL_SAMPLER_2D_SHADOW 0x8B62
|
||||
#define GL_FRAGMENT_SHADER_DERIVATIVE_HINT 0x8B8B
|
||||
#define GL_PIXEL_PACK_BUFFER 0x88EB
|
||||
#define GL_PIXEL_UNPACK_BUFFER 0x88EC
|
||||
#define GL_PIXEL_PACK_BUFFER_BINDING 0x88ED
|
||||
#define GL_PIXEL_UNPACK_BUFFER_BINDING 0x88EF
|
||||
#define GL_FLOAT_MAT2x3 0x8B65
|
||||
#define GL_FLOAT_MAT2x4 0x8B66
|
||||
#define GL_FLOAT_MAT3x2 0x8B67
|
||||
#define GL_FLOAT_MAT3x4 0x8B68
|
||||
#define GL_FLOAT_MAT4x2 0x8B69
|
||||
#define GL_FLOAT_MAT4x3 0x8B6A
|
||||
#define GL_SRGB 0x8C40
|
||||
#define GL_SRGB8 0x8C41
|
||||
#define GL_SRGB8_ALPHA8 0x8C43
|
||||
#define GL_COMPARE_REF_TO_TEXTURE 0x884E
|
||||
#define GL_MAJOR_VERSION 0x821B
|
||||
#define GL_MINOR_VERSION 0x821C
|
||||
#define GL_NUM_EXTENSIONS 0x821D
|
||||
#define GL_RGBA32F 0x8814
|
||||
#define GL_RGB32F 0x8815
|
||||
#define GL_RGBA16F 0x881A
|
||||
#define GL_RGB16F 0x881B
|
||||
#define GL_VERTEX_ATTRIB_ARRAY_INTEGER 0x88FD
|
||||
#define GL_MAX_ARRAY_TEXTURE_LAYERS 0x88FF
|
||||
#define GL_MIN_PROGRAM_TEXEL_OFFSET 0x8904
|
||||
#define GL_MAX_PROGRAM_TEXEL_OFFSET 0x8905
|
||||
#define GL_MAX_VARYING_COMPONENTS 0x8B4B
|
||||
#define GL_TEXTURE_2D_ARRAY 0x8C1A
|
||||
#define GL_TEXTURE_BINDING_2D_ARRAY 0x8C1D
|
||||
#define GL_R11F_G11F_B10F 0x8C3A
|
||||
#define GL_UNSIGNED_INT_10F_11F_11F_REV 0x8C3B
|
||||
#define GL_RGB9_E5 0x8C3D
|
||||
#define GL_UNSIGNED_INT_5_9_9_9_REV 0x8C3E
|
||||
#define GL_TRANSFORM_FEEDBACK_VARYING_MAX_LENGTH 0x8C76
|
||||
#define GL_TRANSFORM_FEEDBACK_BUFFER_MODE 0x8C7F
|
||||
#define GL_MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS 0x8C80
|
||||
#define GL_TRANSFORM_FEEDBACK_VARYINGS 0x8C83
|
||||
#define GL_TRANSFORM_FEEDBACK_BUFFER_START 0x8C84
|
||||
#define GL_TRANSFORM_FEEDBACK_BUFFER_SIZE 0x8C85
|
||||
#define GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN 0x8C88
|
||||
#define GL_RASTERIZER_DISCARD 0x8C89
|
||||
#define GL_MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS 0x8C8A
|
||||
#define GL_MAX_TRANSFORM_FEEDBACK_SEPARATE_ATTRIBS 0x8C8B
|
||||
#define GL_INTERLEAVED_ATTRIBS 0x8C8C
|
||||
#define GL_SEPARATE_ATTRIBS 0x8C8D
|
||||
#define GL_TRANSFORM_FEEDBACK_BUFFER 0x8C8E
|
||||
#define GL_TRANSFORM_FEEDBACK_BUFFER_BINDING 0x8C8F
|
||||
#define GL_RGBA32UI 0x8D70
|
||||
#define GL_RGB32UI 0x8D71
|
||||
#define GL_RGBA16UI 0x8D76
|
||||
#define GL_RGB16UI 0x8D77
|
||||
#define GL_RGBA8UI 0x8D7C
|
||||
#define GL_RGB8UI 0x8D7D
|
||||
#define GL_RGBA32I 0x8D82
|
||||
#define GL_RGB32I 0x8D83
|
||||
#define GL_RGBA16I 0x8D88
|
||||
#define GL_RGB16I 0x8D89
|
||||
#define GL_RGBA8I 0x8D8E
|
||||
#define GL_RGB8I 0x8D8F
|
||||
#define GL_RED_INTEGER 0x8D94
|
||||
#define GL_RGB_INTEGER 0x8D98
|
||||
#define GL_RGBA_INTEGER 0x8D99
|
||||
#define GL_SAMPLER_2D_ARRAY 0x8DC1
|
||||
#define GL_SAMPLER_2D_ARRAY_SHADOW 0x8DC4
|
||||
#define GL_SAMPLER_CUBE_SHADOW 0x8DC5
|
||||
#define GL_UNSIGNED_INT_VEC2 0x8DC6
|
||||
#define GL_UNSIGNED_INT_VEC3 0x8DC7
|
||||
#define GL_UNSIGNED_INT_VEC4 0x8DC8
|
||||
#define GL_INT_SAMPLER_2D 0x8DCA
|
||||
#define GL_INT_SAMPLER_3D 0x8DCB
|
||||
#define GL_INT_SAMPLER_CUBE 0x8DCC
|
||||
#define GL_INT_SAMPLER_2D_ARRAY 0x8DCF
|
||||
#define GL_UNSIGNED_INT_SAMPLER_2D 0x8DD2
|
||||
#define GL_UNSIGNED_INT_SAMPLER_3D 0x8DD3
|
||||
#define GL_UNSIGNED_INT_SAMPLER_CUBE 0x8DD4
|
||||
#define GL_UNSIGNED_INT_SAMPLER_2D_ARRAY 0x8DD7
|
||||
#define GL_BUFFER_ACCESS_FLAGS 0x911F
|
||||
#define GL_BUFFER_MAP_LENGTH 0x9120
|
||||
#define GL_BUFFER_MAP_OFFSET 0x9121
|
||||
#define GL_DEPTH_COMPONENT32F 0x8CAC
|
||||
#define GL_DEPTH32F_STENCIL8 0x8CAD
|
||||
#define GL_FLOAT_32_UNSIGNED_INT_24_8_REV 0x8DAD
|
||||
#define GL_FRAMEBUFFER_ATTACHMENT_COLOR_ENCODING 0x8210
|
||||
#define GL_FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE 0x8211
|
||||
#define GL_FRAMEBUFFER_ATTACHMENT_RED_SIZE 0x8212
|
||||
#define GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE 0x8213
|
||||
#define GL_FRAMEBUFFER_ATTACHMENT_BLUE_SIZE 0x8214
|
||||
#define GL_FRAMEBUFFER_ATTACHMENT_ALPHA_SIZE 0x8215
|
||||
#define GL_FRAMEBUFFER_ATTACHMENT_DEPTH_SIZE 0x8216
|
||||
#define GL_FRAMEBUFFER_ATTACHMENT_STENCIL_SIZE 0x8217
|
||||
#define GL_FRAMEBUFFER_DEFAULT 0x8218
|
||||
#define GL_FRAMEBUFFER_UNDEFINED 0x8219
|
||||
#define GL_DEPTH_STENCIL_ATTACHMENT 0x821A
|
||||
#define GL_DEPTH_STENCIL 0x84F9
|
||||
#define GL_UNSIGNED_INT_24_8 0x84FA
|
||||
#define GL_DEPTH24_STENCIL8 0x88F0
|
||||
#define GL_UNSIGNED_NORMALIZED 0x8C17
|
||||
#define GL_DRAW_FRAMEBUFFER_BINDING GL_FRAMEBUFFER_BINDING
|
||||
#define GL_READ_FRAMEBUFFER 0x8CA8
|
||||
#define GL_DRAW_FRAMEBUFFER 0x8CA9
|
||||
#define GL_READ_FRAMEBUFFER_BINDING 0x8CAA
|
||||
#define GL_RENDERBUFFER_SAMPLES 0x8CAB
|
||||
#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_LAYER 0x8CD4
|
||||
#define GL_MAX_COLOR_ATTACHMENTS 0x8CDF
|
||||
#define GL_COLOR_ATTACHMENT1 0x8CE1
|
||||
#define GL_COLOR_ATTACHMENT2 0x8CE2
|
||||
#define GL_COLOR_ATTACHMENT3 0x8CE3
|
||||
#define GL_COLOR_ATTACHMENT4 0x8CE4
|
||||
#define GL_COLOR_ATTACHMENT5 0x8CE5
|
||||
#define GL_COLOR_ATTACHMENT6 0x8CE6
|
||||
#define GL_COLOR_ATTACHMENT7 0x8CE7
|
||||
#define GL_COLOR_ATTACHMENT8 0x8CE8
|
||||
#define GL_COLOR_ATTACHMENT9 0x8CE9
|
||||
#define GL_COLOR_ATTACHMENT10 0x8CEA
|
||||
#define GL_COLOR_ATTACHMENT11 0x8CEB
|
||||
#define GL_COLOR_ATTACHMENT12 0x8CEC
|
||||
#define GL_COLOR_ATTACHMENT13 0x8CED
|
||||
#define GL_COLOR_ATTACHMENT14 0x8CEE
|
||||
#define GL_COLOR_ATTACHMENT15 0x8CEF
|
||||
#define GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE 0x8D56
|
||||
#define GL_MAX_SAMPLES 0x8D57
|
||||
#define GL_HALF_FLOAT 0x140B
|
||||
#define GL_MAP_READ_BIT 0x0001
|
||||
#define GL_MAP_WRITE_BIT 0x0002
|
||||
#define GL_MAP_INVALIDATE_RANGE_BIT 0x0004
|
||||
#define GL_MAP_INVALIDATE_BUFFER_BIT 0x0008
|
||||
#define GL_MAP_FLUSH_EXPLICIT_BIT 0x0010
|
||||
#define GL_MAP_UNSYNCHRONIZED_BIT 0x0020
|
||||
#define GL_RG 0x8227
|
||||
#define GL_RG_INTEGER 0x8228
|
||||
#define GL_R8 0x8229
|
||||
#define GL_RG8 0x822B
|
||||
#define GL_R16F 0x822D
|
||||
#define GL_R32F 0x822E
|
||||
#define GL_RG16F 0x822F
|
||||
#define GL_RG32F 0x8230
|
||||
#define GL_R8I 0x8231
|
||||
#define GL_R8UI 0x8232
|
||||
#define GL_R16I 0x8233
|
||||
#define GL_R16UI 0x8234
|
||||
#define GL_R32I 0x8235
|
||||
#define GL_R32UI 0x8236
|
||||
#define GL_RG8I 0x8237
|
||||
#define GL_RG8UI 0x8238
|
||||
#define GL_RG16I 0x8239
|
||||
#define GL_RG16UI 0x823A
|
||||
#define GL_RG32I 0x823B
|
||||
#define GL_RG32UI 0x823C
|
||||
#define GL_VERTEX_ARRAY_BINDING 0x85B5
|
||||
#define GL_R8_SNORM 0x8F94
|
||||
#define GL_RG8_SNORM 0x8F95
|
||||
#define GL_RGB8_SNORM 0x8F96
|
||||
#define GL_RGBA8_SNORM 0x8F97
|
||||
#define GL_SIGNED_NORMALIZED 0x8F9C
|
||||
#define GL_PRIMITIVE_RESTART_FIXED_INDEX 0x8D69
|
||||
#define GL_COPY_READ_BUFFER 0x8F36
|
||||
#define GL_COPY_WRITE_BUFFER 0x8F37
|
||||
#define GL_COPY_READ_BUFFER_BINDING GL_COPY_READ_BUFFER
|
||||
#define GL_COPY_WRITE_BUFFER_BINDING GL_COPY_WRITE_BUFFER
|
||||
#define GL_UNIFORM_BUFFER 0x8A11
|
||||
#define GL_UNIFORM_BUFFER_BINDING 0x8A28
|
||||
#define GL_UNIFORM_BUFFER_START 0x8A29
|
||||
#define GL_UNIFORM_BUFFER_SIZE 0x8A2A
|
||||
#define GL_MAX_VERTEX_UNIFORM_BLOCKS 0x8A2B
|
||||
#define GL_MAX_FRAGMENT_UNIFORM_BLOCKS 0x8A2D
|
||||
#define GL_MAX_COMBINED_UNIFORM_BLOCKS 0x8A2E
|
||||
#define GL_MAX_UNIFORM_BUFFER_BINDINGS 0x8A2F
|
||||
#define GL_MAX_UNIFORM_BLOCK_SIZE 0x8A30
|
||||
#define GL_MAX_COMBINED_VERTEX_UNIFORM_COMPONENTS 0x8A31
|
||||
#define GL_MAX_COMBINED_FRAGMENT_UNIFORM_COMPONENTS 0x8A33
|
||||
#define GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT 0x8A34
|
||||
#define GL_ACTIVE_UNIFORM_BLOCK_MAX_NAME_LENGTH 0x8A35
|
||||
#define GL_ACTIVE_UNIFORM_BLOCKS 0x8A36
|
||||
#define GL_UNIFORM_TYPE 0x8A37
|
||||
#define GL_UNIFORM_SIZE 0x8A38
|
||||
#define GL_UNIFORM_NAME_LENGTH 0x8A39
|
||||
#define GL_UNIFORM_BLOCK_INDEX 0x8A3A
|
||||
#define GL_UNIFORM_OFFSET 0x8A3B
|
||||
#define GL_UNIFORM_ARRAY_STRIDE 0x8A3C
|
||||
#define GL_UNIFORM_MATRIX_STRIDE 0x8A3D
|
||||
#define GL_UNIFORM_IS_ROW_MAJOR 0x8A3E
|
||||
#define GL_UNIFORM_BLOCK_BINDING 0x8A3F
|
||||
#define GL_UNIFORM_BLOCK_DATA_SIZE 0x8A40
|
||||
#define GL_UNIFORM_BLOCK_NAME_LENGTH 0x8A41
|
||||
#define GL_UNIFORM_BLOCK_ACTIVE_UNIFORMS 0x8A42
|
||||
#define GL_UNIFORM_BLOCK_ACTIVE_UNIFORM_INDICES 0x8A43
|
||||
#define GL_UNIFORM_BLOCK_REFERENCED_BY_VERTEX_SHADER 0x8A44
|
||||
#define GL_UNIFORM_BLOCK_REFERENCED_BY_FRAGMENT_SHADER 0x8A46
|
||||
#define GL_INVALID_INDEX 0xFFFFFFFFu
|
||||
#define GL_MAX_VERTEX_OUTPUT_COMPONENTS 0x9122
|
||||
#define GL_MAX_FRAGMENT_INPUT_COMPONENTS 0x9125
|
||||
#define GL_MAX_SERVER_WAIT_TIMEOUT 0x9111
|
||||
#define GL_OBJECT_TYPE 0x9112
|
||||
#define GL_SYNC_CONDITION 0x9113
|
||||
#define GL_SYNC_STATUS 0x9114
|
||||
#define GL_SYNC_FLAGS 0x9115
|
||||
#define GL_SYNC_FENCE 0x9116
|
||||
#define GL_SYNC_GPU_COMMANDS_COMPLETE 0x9117
|
||||
#define GL_UNSIGNALED 0x9118
|
||||
#define GL_SIGNALED 0x9119
|
||||
#define GL_ALREADY_SIGNALED 0x911A
|
||||
#define GL_TIMEOUT_EXPIRED 0x911B
|
||||
#define GL_CONDITION_SATISFIED 0x911C
|
||||
#define GL_WAIT_FAILED 0x911D
|
||||
#define GL_SYNC_FLUSH_COMMANDS_BIT 0x00000001
|
||||
#define GL_TIMEOUT_IGNORED 0xFFFFFFFFFFFFFFFFull
|
||||
#define GL_VERTEX_ATTRIB_ARRAY_DIVISOR 0x88FE
|
||||
#define GL_ANY_SAMPLES_PASSED 0x8C2F
|
||||
#define GL_ANY_SAMPLES_PASSED_CONSERVATIVE 0x8D6A
|
||||
#define GL_SAMPLER_BINDING 0x8919
|
||||
#define GL_RGB10_A2UI 0x906F
|
||||
#define GL_TEXTURE_SWIZZLE_R 0x8E42
|
||||
#define GL_TEXTURE_SWIZZLE_G 0x8E43
|
||||
#define GL_TEXTURE_SWIZZLE_B 0x8E44
|
||||
#define GL_TEXTURE_SWIZZLE_A 0x8E45
|
||||
#define GL_GREEN 0x1904
|
||||
#define GL_BLUE 0x1905
|
||||
#define GL_INT_2_10_10_10_REV 0x8D9F
|
||||
#define GL_TRANSFORM_FEEDBACK 0x8E22
|
||||
#define GL_TRANSFORM_FEEDBACK_PAUSED 0x8E23
|
||||
#define GL_TRANSFORM_FEEDBACK_ACTIVE 0x8E24
|
||||
#define GL_TRANSFORM_FEEDBACK_BINDING 0x8E25
|
||||
#define GL_PROGRAM_BINARY_RETRIEVABLE_HINT 0x8257
|
||||
#define GL_PROGRAM_BINARY_LENGTH 0x8741
|
||||
#define GL_NUM_PROGRAM_BINARY_FORMATS 0x87FE
|
||||
#define GL_PROGRAM_BINARY_FORMATS 0x87FF
|
||||
#define GL_COMPRESSED_R11_EAC 0x9270
|
||||
#define GL_COMPRESSED_SIGNED_R11_EAC 0x9271
|
||||
#define GL_COMPRESSED_RG11_EAC 0x9272
|
||||
#define GL_COMPRESSED_SIGNED_RG11_EAC 0x9273
|
||||
#define GL_COMPRESSED_RGB8_ETC2 0x9274
|
||||
#define GL_COMPRESSED_SRGB8_ETC2 0x9275
|
||||
#define GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2 0x9276
|
||||
#define GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2 0x9277
|
||||
#define GL_COMPRESSED_RGBA8_ETC2_EAC 0x9278
|
||||
#define GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC 0x9279
|
||||
#define GL_TEXTURE_IMMUTABLE_FORMAT 0x912F
|
||||
#define GL_MAX_ELEMENT_INDEX 0x8D6B
|
||||
#define GL_NUM_SAMPLE_COUNTS 0x9380
|
||||
#define GL_TEXTURE_IMMUTABLE_LEVELS 0x82DF
|
||||
|
||||
/*-------------------------------------------------------------------------
|
||||
* Entrypoint definitions
|
||||
*-----------------------------------------------------------------------*/
|
||||
|
||||
/* OpenGL ES 3.0 */
|
||||
|
||||
extern GL_APICALL void (* GL_APIENTRY glReadBuffer) (GLenum mode);
|
||||
extern GL_APICALL void (* GL_APIENTRY glDrawRangeElements) (GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const GLvoid* indices);
|
||||
extern GL_APICALL void (* GL_APIENTRY glTexImage3D) (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const GLvoid* pixels);
|
||||
extern GL_APICALL void (* GL_APIENTRY glTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const GLvoid* pixels);
|
||||
extern GL_APICALL void (* GL_APIENTRY glCopyTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height);
|
||||
extern GL_APICALL void (* GL_APIENTRY glCompressedTexImage3D) (GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const GLvoid* data);
|
||||
extern GL_APICALL void (* GL_APIENTRY glCompressedTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const GLvoid* data);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGenQueries) (GLsizei n, GLuint* ids);
|
||||
extern GL_APICALL void (* GL_APIENTRY glDeleteQueries) (GLsizei n, const GLuint* ids);
|
||||
extern GL_APICALL GLboolean (* GL_APIENTRY glIsQuery) (GLuint id);
|
||||
extern GL_APICALL void (* GL_APIENTRY glBeginQuery) (GLenum target, GLuint id);
|
||||
extern GL_APICALL void (* GL_APIENTRY glEndQuery) (GLenum target);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetQueryiv) (GLenum target, GLenum pname, GLint* params);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetQueryObjectuiv) (GLuint id, GLenum pname, GLuint* params);
|
||||
extern GL_APICALL GLboolean (* GL_APIENTRY glUnmapBuffer) (GLenum target);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetBufferPointerv) (GLenum target, GLenum pname, GLvoid** params);
|
||||
extern GL_APICALL void (* GL_APIENTRY glDrawBuffers) (GLsizei n, const GLenum* bufs);
|
||||
extern GL_APICALL void (* GL_APIENTRY glUniformMatrix2x3fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
|
||||
extern GL_APICALL void (* GL_APIENTRY glUniformMatrix3x2fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
|
||||
extern GL_APICALL void (* GL_APIENTRY glUniformMatrix2x4fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
|
||||
extern GL_APICALL void (* GL_APIENTRY glUniformMatrix4x2fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
|
||||
extern GL_APICALL void (* GL_APIENTRY glUniformMatrix3x4fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
|
||||
extern GL_APICALL void (* GL_APIENTRY glUniformMatrix4x3fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
|
||||
extern GL_APICALL void (* GL_APIENTRY glBlitFramebuffer) (GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter);
|
||||
extern GL_APICALL void (* GL_APIENTRY glRenderbufferStorageMultisample) (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
|
||||
extern GL_APICALL void (* GL_APIENTRY glFramebufferTextureLayer) (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint layer);
|
||||
extern GL_APICALL GLvoid* (* GL_APIENTRY glMapBufferRange) (GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
|
||||
extern GL_APICALL void (* GL_APIENTRY glFlushMappedBufferRange) (GLenum target, GLintptr offset, GLsizeiptr length);
|
||||
extern GL_APICALL void (* GL_APIENTRY glBindVertexArray) (GLuint array);
|
||||
extern GL_APICALL void (* GL_APIENTRY glDeleteVertexArrays) (GLsizei n, const GLuint* arrays);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGenVertexArrays) (GLsizei n, GLuint* arrays);
|
||||
extern GL_APICALL GLboolean (* GL_APIENTRY glIsVertexArray) (GLuint array);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetIntegeri_v) (GLenum target, GLuint index, GLint* data);
|
||||
extern GL_APICALL void (* GL_APIENTRY glBeginTransformFeedback) (GLenum primitiveMode);
|
||||
extern GL_APICALL void (* GL_APIENTRY glEndTransformFeedback) (void);
|
||||
extern GL_APICALL void (* GL_APIENTRY glBindBufferRange) (GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size);
|
||||
extern GL_APICALL void (* GL_APIENTRY glBindBufferBase) (GLenum target, GLuint index, GLuint buffer);
|
||||
extern GL_APICALL void (* GL_APIENTRY glTransformFeedbackVaryings) (GLuint program, GLsizei count, const GLchar* const* varyings, GLenum bufferMode);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetTransformFeedbackVarying) (GLuint program, GLuint index, GLsizei bufSize, GLsizei* length, GLsizei* size, GLenum* type, GLchar* name);
|
||||
extern GL_APICALL void (* GL_APIENTRY glVertexAttribIPointer) (GLuint index, GLint size, GLenum type, GLsizei stride, const GLvoid* pointer);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetVertexAttribIiv) (GLuint index, GLenum pname, GLint* params);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetVertexAttribIuiv) (GLuint index, GLenum pname, GLuint* params);
|
||||
extern GL_APICALL void (* GL_APIENTRY glVertexAttribI4i) (GLuint index, GLint x, GLint y, GLint z, GLint w);
|
||||
extern GL_APICALL void (* GL_APIENTRY glVertexAttribI4ui) (GLuint index, GLuint x, GLuint y, GLuint z, GLuint w);
|
||||
extern GL_APICALL void (* GL_APIENTRY glVertexAttribI4iv) (GLuint index, const GLint* v);
|
||||
extern GL_APICALL void (* GL_APIENTRY glVertexAttribI4uiv) (GLuint index, const GLuint* v);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetUniformuiv) (GLuint program, GLint location, GLuint* params);
|
||||
extern GL_APICALL GLint (* GL_APIENTRY glGetFragDataLocation) (GLuint program, const GLchar *name);
|
||||
extern GL_APICALL void (* GL_APIENTRY glUniform1ui) (GLint location, GLuint v0);
|
||||
extern GL_APICALL void (* GL_APIENTRY glUniform2ui) (GLint location, GLuint v0, GLuint v1);
|
||||
extern GL_APICALL void (* GL_APIENTRY glUniform3ui) (GLint location, GLuint v0, GLuint v1, GLuint v2);
|
||||
extern GL_APICALL void (* GL_APIENTRY glUniform4ui) (GLint location, GLuint v0, GLuint v1, GLuint v2, GLuint v3);
|
||||
extern GL_APICALL void (* GL_APIENTRY glUniform1uiv) (GLint location, GLsizei count, const GLuint* value);
|
||||
extern GL_APICALL void (* GL_APIENTRY glUniform2uiv) (GLint location, GLsizei count, const GLuint* value);
|
||||
extern GL_APICALL void (* GL_APIENTRY glUniform3uiv) (GLint location, GLsizei count, const GLuint* value);
|
||||
extern GL_APICALL void (* GL_APIENTRY glUniform4uiv) (GLint location, GLsizei count, const GLuint* value);
|
||||
extern GL_APICALL void (* GL_APIENTRY glClearBufferiv) (GLenum buffer, GLint drawbuffer, const GLint* value);
|
||||
extern GL_APICALL void (* GL_APIENTRY glClearBufferuiv) (GLenum buffer, GLint drawbuffer, const GLuint* value);
|
||||
extern GL_APICALL void (* GL_APIENTRY glClearBufferfv) (GLenum buffer, GLint drawbuffer, const GLfloat* value);
|
||||
extern GL_APICALL void (* GL_APIENTRY glClearBufferfi) (GLenum buffer, GLint drawbuffer, GLfloat depth, GLint stencil);
|
||||
extern GL_APICALL const GLubyte* (* GL_APIENTRY glGetStringi) (GLenum name, GLuint index);
|
||||
extern GL_APICALL void (* GL_APIENTRY glCopyBufferSubData) (GLenum readTarget, GLenum writeTarget, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetUniformIndices) (GLuint program, GLsizei uniformCount, const GLchar* const* uniformNames, GLuint* uniformIndices);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetActiveUniformsiv) (GLuint program, GLsizei uniformCount, const GLuint* uniformIndices, GLenum pname, GLint* params);
|
||||
extern GL_APICALL GLuint (* GL_APIENTRY glGetUniformBlockIndex) (GLuint program, const GLchar* uniformBlockName);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetActiveUniformBlockiv) (GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint* params);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetActiveUniformBlockName) (GLuint program, GLuint uniformBlockIndex, GLsizei bufSize, GLsizei* length, GLchar* uniformBlockName);
|
||||
extern GL_APICALL void (* GL_APIENTRY glUniformBlockBinding) (GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding);
|
||||
extern GL_APICALL void (* GL_APIENTRY glDrawArraysInstanced) (GLenum mode, GLint first, GLsizei count, GLsizei instanceCount);
|
||||
extern GL_APICALL void (* GL_APIENTRY glDrawElementsInstanced) (GLenum mode, GLsizei count, GLenum type, const GLvoid* indices, GLsizei instanceCount);
|
||||
extern GL_APICALL GLsync (* GL_APIENTRY glFenceSync) (GLenum condition, GLbitfield flags);
|
||||
extern GL_APICALL GLboolean (* GL_APIENTRY glIsSync) (GLsync sync);
|
||||
extern GL_APICALL void (* GL_APIENTRY glDeleteSync) (GLsync sync);
|
||||
extern GL_APICALL GLenum (* GL_APIENTRY glClientWaitSync) (GLsync sync, GLbitfield flags, GLuint64 timeout);
|
||||
extern GL_APICALL void (* GL_APIENTRY glWaitSync) (GLsync sync, GLbitfield flags, GLuint64 timeout);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetInteger64v) (GLenum pname, GLint64* params);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetSynciv) (GLsync sync, GLenum pname, GLsizei bufSize, GLsizei* length, GLint* values);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetInteger64i_v) (GLenum target, GLuint index, GLint64* data);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetBufferParameteri64v) (GLenum target, GLenum pname, GLint64* params);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGenSamplers) (GLsizei count, GLuint* samplers);
|
||||
extern GL_APICALL void (* GL_APIENTRY glDeleteSamplers) (GLsizei count, const GLuint* samplers);
|
||||
extern GL_APICALL GLboolean (* GL_APIENTRY glIsSampler) (GLuint sampler);
|
||||
extern GL_APICALL void (* GL_APIENTRY glBindSampler) (GLuint unit, GLuint sampler);
|
||||
extern GL_APICALL void (* GL_APIENTRY glSamplerParameteri) (GLuint sampler, GLenum pname, GLint param);
|
||||
extern GL_APICALL void (* GL_APIENTRY glSamplerParameteriv) (GLuint sampler, GLenum pname, const GLint* param);
|
||||
extern GL_APICALL void (* GL_APIENTRY glSamplerParameterf) (GLuint sampler, GLenum pname, GLfloat param);
|
||||
extern GL_APICALL void (* GL_APIENTRY glSamplerParameterfv) (GLuint sampler, GLenum pname, const GLfloat* param);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetSamplerParameteriv) (GLuint sampler, GLenum pname, GLint* params);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetSamplerParameterfv) (GLuint sampler, GLenum pname, GLfloat* params);
|
||||
extern GL_APICALL void (* GL_APIENTRY glVertexAttribDivisor) (GLuint index, GLuint divisor);
|
||||
extern GL_APICALL void (* GL_APIENTRY glBindTransformFeedback) (GLenum target, GLuint id);
|
||||
extern GL_APICALL void (* GL_APIENTRY glDeleteTransformFeedbacks) (GLsizei n, const GLuint* ids);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGenTransformFeedbacks) (GLsizei n, GLuint* ids);
|
||||
extern GL_APICALL GLboolean (* GL_APIENTRY glIsTransformFeedback) (GLuint id);
|
||||
extern GL_APICALL void (* GL_APIENTRY glPauseTransformFeedback) (void);
|
||||
extern GL_APICALL void (* GL_APIENTRY glResumeTransformFeedback) (void);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetProgramBinary) (GLuint program, GLsizei bufSize, GLsizei* length, GLenum* binaryFormat, GLvoid* binary);
|
||||
extern GL_APICALL void (* GL_APIENTRY glProgramBinary) (GLuint program, GLenum binaryFormat, const GLvoid* binary, GLsizei length);
|
||||
extern GL_APICALL void (* GL_APIENTRY glProgramParameteri) (GLuint program, GLenum pname, GLint value);
|
||||
extern GL_APICALL void (* GL_APIENTRY glInvalidateFramebuffer) (GLenum target, GLsizei numAttachments, const GLenum* attachments);
|
||||
extern GL_APICALL void (* GL_APIENTRY glInvalidateSubFramebuffer) (GLenum target, GLsizei numAttachments, const GLenum* attachments, GLint x, GLint y, GLsizei width, GLsizei height);
|
||||
extern GL_APICALL void (* GL_APIENTRY glTexStorage2D) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
|
||||
extern GL_APICALL void (* GL_APIENTRY glTexStorage3D) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetInternalformativ) (GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint* params);
|
||||
|
||||
#ifndef GL_EXT_texture_border_clamp
|
||||
#define GL_EXT_texture_border_clamp 1
|
||||
#define GL_TEXTURE_BORDER_COLOR_EXT 0x1004
|
||||
#define GL_CLAMP_TO_BORDER_EXT 0x812D
|
||||
extern GL_APICALL void (* GL_APIENTRY glTexParameterIivEXT) (GLenum target, GLenum pname, const GLint *params);
|
||||
extern GL_APICALL void (* GL_APIENTRY glTexParameterIuivEXT) (GLenum target, GLenum pname, const GLuint *params);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetTexParameterIivEXT) (GLenum target, GLenum pname, GLint *params);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetTexParameterIuivEXT) (GLenum target, GLenum pname, GLuint *params);
|
||||
extern GL_APICALL void (* GL_APIENTRY glSamplerParameterIivEXT) (GLuint sampler, GLenum pname, const GLint *param);
|
||||
extern GL_APICALL void (* GL_APIENTRY glSamplerParameterIuivEXT) (GLuint sampler, GLenum pname, const GLuint *param);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetSamplerParameterIivEXT) (GLuint sampler, GLenum pname, GLint *params);
|
||||
extern GL_APICALL void (* GL_APIENTRY glGetSamplerParameterIuivEXT) (GLuint sampler, GLenum pname, GLuint *params);
|
||||
#endif /* GL_EXT_texture_border_clamp */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
// clang-format on
|
||||
|
||||
#endif
|
@ -1,2 +0,0 @@
|
||||
file(GLOB_RECURSE tmp *.cc)
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
|
@ -1,249 +0,0 @@
|
||||
|
||||
#include "DataTransfer.h"
|
||||
#include "GLLogging.h"
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
inline uint16x4x4_t vld4_u16_aligned16(const uint16_t* address) {
|
||||
return vld4_u16(static_cast<const uint16_t*>(__builtin_assume_aligned(address, 16)));
|
||||
}
|
||||
|
||||
inline uint16x4_t vld1_u16_aligned8(const uint16_t* address) {
|
||||
return vld1_u16(static_cast<const uint16_t*>(__builtin_assume_aligned(address, 8)));
|
||||
}
|
||||
|
||||
inline void vst4_u16_aligned16(uint16_t* address, uint16x4x4_t data) {
|
||||
vst4_u16(static_cast<uint16_t*>(__builtin_assume_aligned(address, 16)), data);
|
||||
}
|
||||
|
||||
inline void vst1_u16_aligned8(uint16_t* address, uint16x4_t data) {
|
||||
vst1_u16(static_cast<uint16_t*>(__builtin_assume_aligned(address, 8)), data);
|
||||
}
|
||||
|
||||
template <int input_channels>
|
||||
static void interleaveSlice(
|
||||
void* output, const float* input, size_t width, size_t height, size_t row_stride) {
|
||||
const float* input_r = input;
|
||||
const float* input_g = input_r + height * width;
|
||||
const float* input_b = input_g + height * width;
|
||||
const float* input_a = input_b + height * width;
|
||||
uint16_t* output_f16 = static_cast<uint16_t*>(output);
|
||||
if (width >= 4) {
|
||||
for (size_t y = 0; y < height; y++) {
|
||||
size_t nx = width;
|
||||
while (nx >= 4) {
|
||||
const uint16x4_t r = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_r)));
|
||||
input_r += 4;
|
||||
uint16x4_t g, b, a;
|
||||
g = b = a = vdup_n_u16(0);
|
||||
if (input_channels >= 2) {
|
||||
g = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_g)));
|
||||
input_g += 4;
|
||||
if (input_channels >= 3) {
|
||||
b = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_b)));
|
||||
input_b += 4;
|
||||
if (input_channels >= 4) {
|
||||
a = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_a)));
|
||||
input_a += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const uint16x4x4_t rgba = (uint16x4x4_t){{r, g, b, a}};
|
||||
vst4_u16_aligned16(output_f16, rgba);
|
||||
output_f16 += 4 * 4;
|
||||
|
||||
nx -= 4;
|
||||
}
|
||||
if (nx != 0) {
|
||||
output_f16 -= (4 - nx) * 4;
|
||||
input_r -= 4 - nx;
|
||||
if (input_channels >= 2) {
|
||||
input_g -= 4 - nx;
|
||||
if (input_channels >= 3) {
|
||||
input_b -= 4 - nx;
|
||||
if (input_channels >= 4) {
|
||||
input_a -= 4 - nx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const uint16x4_t r = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_r)));
|
||||
input_r += 4;
|
||||
uint16x4_t g, b, a;
|
||||
g = b = a = vdup_n_u16(0);
|
||||
if (input_channels >= 2) {
|
||||
g = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_g)));
|
||||
input_g += 4;
|
||||
if (input_channels >= 3) {
|
||||
b = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_b)));
|
||||
input_b += 4;
|
||||
if (input_channels >= 4) {
|
||||
a = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_a)));
|
||||
input_a += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const uint16x4x4_t rgba = (uint16x4x4_t){{r, g, b, a}};
|
||||
vst4_u16_aligned16(output_f16, rgba);
|
||||
output_f16 += 4 * 4;
|
||||
}
|
||||
output_f16 += (row_stride - width) * 4;
|
||||
}
|
||||
} else {
|
||||
for (size_t y = 0; y < height; y++) {
|
||||
for (size_t x = 0; x < width; x++) {
|
||||
float32x4_t rgba = vld1q_dup_f32(input_r++);
|
||||
if (input_channels >= 2) {
|
||||
rgba = vld1q_lane_f32(input_g++, rgba, 1);
|
||||
if (input_channels >= 3) {
|
||||
rgba = vld1q_lane_f32(input_b++, rgba, 2);
|
||||
if (input_channels >= 4) {
|
||||
rgba = vld1q_lane_f32(input_a++, rgba, 3);
|
||||
}
|
||||
}
|
||||
}
|
||||
vst1_u16_aligned8(output_f16, uint16x4_t(vcvt_f16_f32(rgba)));
|
||||
output_f16 += 4;
|
||||
}
|
||||
output_f16 += (row_stride - width) * 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void interleaveSlice(void* output,
|
||||
const float* input,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t row_stride,
|
||||
uint16_t input_channels) {
|
||||
switch (input_channels) {
|
||||
case 1:
|
||||
interleaveSlice<1>(output, input, width, height, row_stride);
|
||||
break;
|
||||
case 2:
|
||||
interleaveSlice<2>(output, input, width, height, row_stride);
|
||||
break;
|
||||
case 3:
|
||||
interleaveSlice<3>(output, input, width, height, row_stride);
|
||||
break;
|
||||
case 4:
|
||||
interleaveSlice<4>(output, input, width, height, row_stride);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
template <int output_channels>
|
||||
static void deInterleaveSlice(
|
||||
float* output, const void* input, size_t width, size_t height, size_t row_stride) {
|
||||
float* output_r = output;
|
||||
float* output_g = output_r + height * width;
|
||||
float* output_b = output_g + height * width;
|
||||
float* output_a = output_b + height * width;
|
||||
const uint16_t* input_f16 = static_cast<const uint16_t*>(input);
|
||||
if (width >= 4) {
|
||||
for (size_t y = 0; y < height; y++) {
|
||||
size_t nx = width;
|
||||
while (nx >= 4) {
|
||||
const uint16x4x4_t rgba = vld4_u16_aligned16(input_f16);
|
||||
input_f16 += 4 * 4;
|
||||
const float32x4_t r = vcvt_f32_f16(float16x4_t(rgba.val[0]));
|
||||
vst1q_f32(output_r, r);
|
||||
output_r += 4;
|
||||
if (output_channels >= 2) {
|
||||
const float32x4_t g = vcvt_f32_f16(float16x4_t(rgba.val[1]));
|
||||
vst1q_f32(output_g, g);
|
||||
output_g += 4;
|
||||
if (output_channels >= 3) {
|
||||
const float32x4_t b = vcvt_f32_f16(float16x4_t(rgba.val[2]));
|
||||
vst1q_f32(output_b, b);
|
||||
output_b += 4;
|
||||
if (output_channels >= 4) {
|
||||
const float32x4_t a = vcvt_f32_f16(float16x4_t(rgba.val[3]));
|
||||
vst1q_f32(output_a, a);
|
||||
output_a += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
nx -= 4;
|
||||
}
|
||||
if (nx != 0) {
|
||||
input_f16 -= (4 - nx) * 4;
|
||||
output_r -= 4 - nx;
|
||||
if (output_channels >= 2) {
|
||||
output_g -= 4 - nx;
|
||||
if (output_channels >= 3) {
|
||||
output_b -= 4 - nx;
|
||||
if (output_channels >= 4) {
|
||||
output_a -= 4 - nx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const uint16x4x4_t rgba = vld4_u16_aligned16(input_f16);
|
||||
input_f16 += 4 * 4;
|
||||
const float32x4_t r = vcvt_f32_f16(float16x4_t(rgba.val[0]));
|
||||
vst1q_f32(output_r, r);
|
||||
output_r += 4;
|
||||
if (output_channels >= 2) {
|
||||
const float32x4_t g = vcvt_f32_f16(float16x4_t(rgba.val[1]));
|
||||
vst1q_f32(output_g, g);
|
||||
output_g += 4;
|
||||
if (output_channels >= 3) {
|
||||
const float32x4_t b = vcvt_f32_f16(float16x4_t(rgba.val[2]));
|
||||
vst1q_f32(output_b, b);
|
||||
output_b += 4;
|
||||
if (output_channels >= 4) {
|
||||
const float32x4_t a = vcvt_f32_f16(float16x4_t(rgba.val[3]));
|
||||
vst1q_f32(output_a, a);
|
||||
output_a += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
input_f16 += (row_stride - width) * 4;
|
||||
}
|
||||
} else {
|
||||
for (size_t y = 0; y < height; y++) {
|
||||
for (size_t x = 0; x < width; x++) {
|
||||
const float32x4_t rgba = vcvt_f32_f16(float16x4_t(vld1_u16_aligned8(input_f16)));
|
||||
input_f16 += 4;
|
||||
vst1q_lane_f32(output_r++, rgba, 0);
|
||||
if (output_channels >= 2) {
|
||||
vst1q_lane_f32(output_g++, rgba, 1);
|
||||
if (output_channels >= 3) {
|
||||
vst1q_lane_f32(output_b++, rgba, 2);
|
||||
if (output_channels >= 4) {
|
||||
vst1q_lane_f32(output_a++, rgba, 3);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
input_f16 += (row_stride - width) * 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void deInterleaveSlice(float* output,
|
||||
const void* input,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t row_stride,
|
||||
uint32_t output_channels) {
|
||||
switch (output_channels) {
|
||||
case 1:
|
||||
deInterleaveSlice<1>(output, input, width, height, row_stride);
|
||||
break;
|
||||
case 2:
|
||||
deInterleaveSlice<2>(output, input, width, height, row_stride);
|
||||
break;
|
||||
case 3:
|
||||
deInterleaveSlice<3>(output, input, width, height, row_stride);
|
||||
break;
|
||||
case 4:
|
||||
deInterleaveSlice<4>(output, input, width, height, row_stride);
|
||||
break;
|
||||
}
|
||||
}
|
@ -1,17 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arm_neon_support.h"
|
||||
|
||||
void interleaveSlice(void* output,
|
||||
const float* input,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t row_stride,
|
||||
uint16_t input_channels);
|
||||
void deInterleaveSlice(float* output,
|
||||
const void* input,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t input_stride,
|
||||
uint32_t output_channels);
|
@ -1,12 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
#if CAFFE2_IOS
|
||||
#include <OpenGLES/ES3/gl.h>
|
||||
#include <OpenGLES/ES3/glext.h>
|
||||
#elif CAFFE2_ANDROID
|
||||
#include <EGL/egl.h>
|
||||
#include <GLES2/gl2.h>
|
||||
#include "caffe2/mobile/contrib/opengl/android/gl3stub.h"
|
||||
#endif
|
@ -1,126 +0,0 @@
|
||||
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
#include "GL.h"
|
||||
#include "GLContext.h"
|
||||
#include "GLLogging.h"
|
||||
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
#if CAFFE2_IOS
|
||||
#include "sys/utsname.h"
|
||||
#include <regex>
|
||||
#endif
|
||||
|
||||
void getOpenGLESVersion(int& major, int& minor) {
|
||||
glGetIntegerv(GL_MAJOR_VERSION, &major);
|
||||
glGetIntegerv(GL_MINOR_VERSION, &minor);
|
||||
}
|
||||
|
||||
bool checkOpenGLExtensions(std::string gl_ext_str) {
|
||||
static std::unordered_set<std::string> extensions;
|
||||
if (extensions.empty()) {
|
||||
const caffe2::string extension_str((const char*)glGetString(GL_EXTENSIONS));
|
||||
LOG(INFO) << "GL_EXTENSIONS: " << extension_str;
|
||||
|
||||
std::stringstream ss(extension_str);
|
||||
while (!ss.eof()) {
|
||||
std::string extension;
|
||||
ss >> extension;
|
||||
extensions.insert(extension);
|
||||
}
|
||||
}
|
||||
|
||||
return extensions.count(gl_ext_str) > 0;
|
||||
}
|
||||
|
||||
bool GLContext::GL_EXT_texture_border_clamp_defined() {
|
||||
static int major = 0, minor = 0;
|
||||
if (major == 0) {
|
||||
getOpenGLESVersion(major, minor);
|
||||
}
|
||||
|
||||
if (major == 3 && minor == 2) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return checkOpenGLExtensions("GL_EXT_texture_border_clamp") || // Most common
|
||||
checkOpenGLExtensions("GL_OES_texture_border_clamp");
|
||||
}
|
||||
|
||||
bool supportOpenGLES3(bool* half_float_supported) {
|
||||
int major = 0, minor = 0;
|
||||
getOpenGLESVersion(major, minor);
|
||||
|
||||
LOG(INFO) << "GL_VERSION: OpenGL ES " << major << "." << minor;
|
||||
|
||||
if (major < 3) {
|
||||
LOG(ERROR) << "OpenGL ES 3.0 not supported";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!checkOpenGLExtensions("GL_EXT_color_buffer_half_float")) {
|
||||
LOG(ERROR) << "GL_EXT_color_buffer_half_float is not available";
|
||||
if (half_float_supported) {
|
||||
*half_float_supported = false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
#if CAFFE2_IOS
|
||||
int iPhoneVersion() {
|
||||
static int version = 0;
|
||||
static std::once_flag once;
|
||||
std::call_once(once, [&]() {
|
||||
struct utsname systemInfo;
|
||||
uname(&systemInfo);
|
||||
std::string iphone_ver_str = systemInfo.machine;
|
||||
LOG(INFO) << systemInfo.machine;
|
||||
|
||||
if (iphone_ver_str.find("iPhone") != std::string::npos) {
|
||||
std::regex regStr("([0-9]+)");
|
||||
std::smatch matchs;
|
||||
if (std::regex_search(iphone_ver_str, matchs, regStr)) {
|
||||
version = stoi(matchs[0]);
|
||||
}
|
||||
}
|
||||
});
|
||||
return version;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if CAFFE2_ANDROID
|
||||
// whitelist of supported GPUs
|
||||
bool isSupportedRenderer() {
|
||||
static std::unordered_set<std::string> supported_renderers = {
|
||||
"Adreno (TM) 540",
|
||||
"Adreno (TM) 530",
|
||||
"Adreno (TM) 510",
|
||||
"Adreno (TM) 430",
|
||||
"Adreno (TM) 418",
|
||||
"Mali-G71",
|
||||
"Mali-T880",
|
||||
"NVIDIA Tegra"};
|
||||
std::string rendererStr((const char*)glGetString(GL_RENDERER));
|
||||
LOG(INFO) << "GL_RENDERER: " << rendererStr;
|
||||
|
||||
int start = rendererStr.find_first_not_of(" ");
|
||||
int end = rendererStr.find_last_not_of(" ");
|
||||
rendererStr = rendererStr.substr(start, end - start + 1);
|
||||
return supported_renderers.count(rendererStr) > 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool isSupportedDevice() {
|
||||
#if CAFFE2_IOS
|
||||
return iPhoneVersion() >= 7; // iPhone 6 and up
|
||||
#elif CAFFE2_ANDROID
|
||||
return isSupportedRenderer();
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
@ -1,46 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
#include "GLTexture.h"
|
||||
#include "caffe2/core/common.h"
|
||||
#include <functional>
|
||||
|
||||
class GLContext {
|
||||
private:
|
||||
static std::unique_ptr<GLContext> _glcontext;
|
||||
std::function<const GLTexture*(const int width, const int height)> foreignTextureAllocator =
|
||||
nullptr;
|
||||
|
||||
protected:
|
||||
bool half_float_supported = true;
|
||||
|
||||
public:
|
||||
virtual void set_context() = 0;
|
||||
virtual void reset_context() = 0;
|
||||
virtual void flush_context() = 0;
|
||||
virtual ~GLContext(){};
|
||||
|
||||
static void initGLContext();
|
||||
static GLContext* getGLContext();
|
||||
static void deleteGLContext();
|
||||
|
||||
static bool GL_EXT_texture_border_clamp_defined();
|
||||
|
||||
inline bool halfFloatTextureSupported() { return half_float_supported; }
|
||||
|
||||
void setTextureAllocator(
|
||||
std::function<const GLTexture*(const int width, const int height)> textureAllocator) {
|
||||
foreignTextureAllocator = textureAllocator;
|
||||
}
|
||||
|
||||
std::function<const GLTexture*(const int width, const int height)> getTextureAllocator() {
|
||||
return foreignTextureAllocator;
|
||||
}
|
||||
};
|
||||
|
||||
bool supportOpenGLES3(bool* hfs = nullptr);
|
||||
|
||||
bool isSupportedDevice();
|
||||
|
||||
#if CAFFE2_IOS
|
||||
int iPhoneVersion();
|
||||
#endif
|
@ -1,567 +0,0 @@
|
||||
|
||||
#include "GLFilter.h"
|
||||
#include <sstream>
|
||||
|
||||
GLFilter::GLFilter(const std::string _kernel_name,
|
||||
const std::string _vertex_shader,
|
||||
const std::string _fragment_shader,
|
||||
const std::vector<binding*> uniforms,
|
||||
const std::vector<binding*> uniform_blocks,
|
||||
const std::vector<binding*> attributes,
|
||||
const replacements_t& _replacements)
|
||||
: kernel_name(_kernel_name),
|
||||
uniforms_(uniforms),
|
||||
uniform_blocks_(uniform_blocks),
|
||||
attributes_(attributes) {
|
||||
// shader program
|
||||
if (createProgram(_vertex_shader.c_str(),
|
||||
process_replacements(_fragment_shader, _replacements).c_str(),
|
||||
&program)) {
|
||||
gl_log(GL_VERBOSE, "created program %d\n", program);
|
||||
} else {
|
||||
releaseBuffers();
|
||||
|
||||
throwRuntimeError(
|
||||
[&](std::stringstream& errmsg) { errmsg << "Problem initializing OpenGL program"; });
|
||||
}
|
||||
}
|
||||
|
||||
const char* shader_utils = R"GLSL(
|
||||
#define unpackHalf4x16(pd) vec4(unpackHalf2x16(pd.x), unpackHalf2x16(pd.y))
|
||||
#define packHalf4x16(pd) uvec2(packHalf2x16(pd.xy), packHalf2x16(pd.zw))
|
||||
)GLSL";
|
||||
|
||||
const char* half_float_texture_utils = R"GLSL(
|
||||
precision mediump sampler2D;
|
||||
|
||||
#define TEXTURE_OUTPUT(_loc, _var) \
|
||||
layout(location = _loc) out mediump vec4 _var
|
||||
#define TEXTURE_INPUT(_var) \
|
||||
uniform sampler2D _var
|
||||
#define TEXTURE_LOAD(_input, _coord) \
|
||||
texelFetch((_input), (_coord), 0)
|
||||
#define TEXTURE_STORE(_val) \
|
||||
(_val)
|
||||
)GLSL";
|
||||
|
||||
const char* half_float_compat_texture_utils = R"GLSL(
|
||||
precision highp usampler2D;
|
||||
|
||||
#define TEXTURE_OUTPUT(_loc, _var) \
|
||||
layout(location = _loc) out highp uvec2 _var
|
||||
#define TEXTURE_INPUT(_var) \
|
||||
uniform usampler2D _var
|
||||
#define TEXTURE_LOAD(_input, _coord) \
|
||||
unpackHalf4x16(texelFetch((_input), (_coord), 0).xy)
|
||||
#define TEXTURE_STORE(_val) \
|
||||
(uvec2(packHalf4x16((_val))))
|
||||
)GLSL";
|
||||
|
||||
std::string GLFilter::process_replacements(std::string shader,
|
||||
const replacements_t& replacements) const {
|
||||
for (auto&& replacement : replacements) {
|
||||
std::string tag = "$(" + replacement.first + ")";
|
||||
std::string value = replacement.second;
|
||||
|
||||
size_t position = shader.find(tag);
|
||||
if (position != std::string::npos) {
|
||||
shader.replace(position, tag.size(), value);
|
||||
} else {
|
||||
throwRuntimeError(
|
||||
[&](std::stringstream& errmsg) { errmsg << "Couldn't find replacement tag: " << tag; });
|
||||
}
|
||||
}
|
||||
|
||||
// Add some #defines for convenience
|
||||
std::string version_tag = "#version 300 es";
|
||||
if (GLContext::getGLContext()->halfFloatTextureSupported()) {
|
||||
shader.insert(shader.find(version_tag) + version_tag.size(), half_float_texture_utils);
|
||||
} else {
|
||||
shader.insert(shader.find(version_tag) + version_tag.size(), half_float_compat_texture_utils);
|
||||
}
|
||||
shader.insert(shader.find(version_tag) + version_tag.size(), shader_utils);
|
||||
return shader;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GLFilter::attach_uniform_buffer(const binding* block,
|
||||
GLuint bindingPoint,
|
||||
std::function<void(T*, size_t)> loader) {
|
||||
if (block->location >= 0) {
|
||||
if (bindingPoint < kMaxUniformBlocks) {
|
||||
if (uniformBlock[bindingPoint] == 0) {
|
||||
// Associate the uniform block index with a binding point
|
||||
glUniformBlockBinding(program, block->location, bindingPoint);
|
||||
|
||||
// Get the size of block
|
||||
glGetActiveUniformBlockiv(program, block->location, GL_UNIFORM_BLOCK_DATA_SIZE, &blockSize[bindingPoint]);
|
||||
|
||||
// Create and fill a buffer object
|
||||
glGenBuffers(1, &uniformBlock[bindingPoint]);
|
||||
|
||||
gl_log(GL_VERBOSE, "created uniform buffer block %d\n", uniformBlock[bindingPoint]);
|
||||
}
|
||||
|
||||
// Fill a buffer object
|
||||
glBindBuffer(GL_UNIFORM_BUFFER, uniformBlock[bindingPoint]);
|
||||
glBufferData(GL_UNIFORM_BUFFER, blockSize[bindingPoint], NULL, GL_DYNAMIC_DRAW);
|
||||
|
||||
checkGLError([&](std::stringstream& errmsg) {
|
||||
errmsg << "Unable to bind uniform buffer " << block->name << ":" << block->location
|
||||
<< " at binding point " << bindingPoint;
|
||||
});
|
||||
|
||||
T* blockData = (T*)glMapBufferRange(
|
||||
GL_UNIFORM_BUFFER, 0, blockSize[bindingPoint], GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT);
|
||||
if (blockData != NULL) {
|
||||
// Copy the data into the mapped buffer
|
||||
if (loader)
|
||||
loader(blockData, blockSize[bindingPoint]);
|
||||
|
||||
// Unmap the buffer
|
||||
if (glUnmapBuffer(GL_UNIFORM_BUFFER) == GL_TRUE) {
|
||||
// Bind the buffer object to the uniform block binding point
|
||||
glBindBufferBase(GL_UNIFORM_BUFFER, bindingPoint, uniformBlock[bindingPoint]);
|
||||
} else {
|
||||
throwRuntimeError([&](std::stringstream& errmsg) { errmsg << "Error unmapping element buffer object"; });
|
||||
}
|
||||
} else {
|
||||
throwRuntimeError([&](std::stringstream& errmsg) {
|
||||
errmsg << "Error mapping element buffer object, blockSize: " << blockSize;
|
||||
});
|
||||
}
|
||||
|
||||
glBindBuffer(GL_UNIFORM_BUFFER, 0);
|
||||
} else {
|
||||
throwRuntimeError([&](std::stringstream& errmsg) {
|
||||
errmsg << "Uniform block binding point out of range: " << bindingPoint << ", should be < "
|
||||
<< kMaxUniformBlocks;
|
||||
});
|
||||
}
|
||||
} else {
|
||||
throwRuntimeError([&](std::stringstream& errmsg) { errmsg << "unbound uniform block"; });
|
||||
}
|
||||
}
|
||||
|
||||
template void GLFilter::attach_uniform_buffer<float16_t>(const binding* block,
|
||||
GLuint bindingPoint,
|
||||
std::function<void(float16_t*, size_t)> loader);
|
||||
|
||||
static const GLenum unused_capability[] = {GL_CULL_FACE,
|
||||
GL_BLEND,
|
||||
GL_DITHER,
|
||||
GL_STENCIL_TEST,
|
||||
GL_DEPTH_TEST,
|
||||
GL_SCISSOR_TEST,
|
||||
GL_POLYGON_OFFSET_FILL,
|
||||
GL_SAMPLE_ALPHA_TO_COVERAGE,
|
||||
GL_SAMPLE_COVERAGE};
|
||||
|
||||
void GLFilter::run(const std::vector<texture_attachment>& input,
|
||||
const std::vector<const GLTexture*>& output,
|
||||
std::function<void(void)> uniforms_initializer,
|
||||
int width,
|
||||
int height) {
|
||||
const int first_texture_id = GL_TEXTURE0;
|
||||
|
||||
GLint defaultFramebuffer = 0;
|
||||
glGetIntegerv(GL_FRAMEBUFFER_BINDING, &defaultFramebuffer);
|
||||
|
||||
gl_log(GL_VERBOSE,
|
||||
"GLFilter::run %s - inputs: %d, outputs: %d, width: %d, height: %d\n",
|
||||
kernel_name.c_str(),
|
||||
input.size(),
|
||||
output.size(),
|
||||
width,
|
||||
height);
|
||||
|
||||
if (output.size() > 4) {
|
||||
throwRuntimeError([&](std::stringstream& errmsg) {
|
||||
errmsg << "Too many output textures: " << output.size() << ", should be <= 4";
|
||||
});
|
||||
}
|
||||
|
||||
if (frameBuffer == 0) {
|
||||
// create the frame buffer
|
||||
glGenFramebuffers(1, &frameBuffer);
|
||||
gl_log(GL_VERBOSE, "created frame buffer %d\n", frameBuffer);
|
||||
}
|
||||
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, frameBuffer);
|
||||
checkGLError([&](std::stringstream& errmsg) { errmsg << "glBindFramebuffer"; });
|
||||
|
||||
// Set up the output textures
|
||||
for (int i = 0; i < output.size(); i++) {
|
||||
GLenum target = output[i]->target();
|
||||
GLuint texture = output[i]->name();
|
||||
|
||||
glBindTexture(target, texture);
|
||||
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, target, texture, 0);
|
||||
|
||||
checkGLError([&](std::stringstream& errmsg) {
|
||||
errmsg << "Unable to connect output texture " << texture << " at color attachment " << i;
|
||||
});
|
||||
|
||||
gl_log(GL_VERBOSE, "connected output texture %d to color attachment %d\n", texture, i);
|
||||
}
|
||||
|
||||
// Bind the output textures to the frame buffer attachments
|
||||
if (!frame_buffer_initialized) {
|
||||
const int attachments_number = output.size();
|
||||
const GLenum attachments[4] = {
|
||||
GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1, GL_COLOR_ATTACHMENT2, GL_COLOR_ATTACHMENT3};
|
||||
|
||||
glDrawBuffers(attachments_number, attachments);
|
||||
|
||||
int fbs = glCheckFramebufferStatus(GL_FRAMEBUFFER);
|
||||
|
||||
if (fbs != GL_FRAMEBUFFER_COMPLETE) {
|
||||
throwRuntimeError(
|
||||
[&](std::stringstream& errmsg) { errmsg << "Frame buffer incomplete: " << fbs; });
|
||||
}
|
||||
|
||||
frame_buffer_initialized = true;
|
||||
}
|
||||
|
||||
glUseProgram(program);
|
||||
checkGLError([&](std::stringstream& errmsg) { errmsg << "glUseProgram"; });
|
||||
|
||||
// Set up the input textures
|
||||
GLenum texture_idx = first_texture_id;
|
||||
for (int i = 0; i < input.size(); i++, texture_idx++) {
|
||||
if (input[i].uniform->location >= 0) {
|
||||
GLenum target = input[i].texture->target();
|
||||
GLuint texture = input[i].texture->name();
|
||||
|
||||
glActiveTexture(texture_idx);
|
||||
glBindTexture(target, texture);
|
||||
glUniform1i(input[i].uniform->location, texture_idx - GL_TEXTURE0);
|
||||
|
||||
checkGLError([&](std::stringstream& errmsg) {
|
||||
errmsg << ": Unable to attach input texture " << texture << " to uniform "
|
||||
<< input[i].uniform->name << ":" << input[i].uniform->location << " at index "
|
||||
<< texture_idx - GL_TEXTURE0;
|
||||
});
|
||||
|
||||
gl_log(GL_VERBOSE,
|
||||
"connected input texture %d to texture unit %d\n",
|
||||
texture,
|
||||
texture_idx - GL_TEXTURE0);
|
||||
} else {
|
||||
gl_log(GL_VERBOSE, "something wrong happened when i = %d\n", i);
|
||||
}
|
||||
}
|
||||
|
||||
// Caller supplied uniforms initializer
|
||||
if (uniforms_initializer) {
|
||||
uniforms_initializer();
|
||||
|
||||
checkGLError([&](std::stringstream& errmsg) {
|
||||
errmsg << "errors in the uniforms initializer callback";
|
||||
});
|
||||
}
|
||||
|
||||
// Validate program
|
||||
if (check_opengl_errors && !validateProgram(program)) {
|
||||
throwRuntimeError(
|
||||
[&](std::stringstream& errmsg) { errmsg << "Couldn't validate OpenGL program"; });
|
||||
}
|
||||
|
||||
glViewport(0, 0, width, height);
|
||||
|
||||
// Disable stuff we don't need and make sure that we have all the channels ebabled
|
||||
for (int i = 0; i < sizeof(unused_capability) / sizeof(GLenum); i++) {
|
||||
glDisable(unused_capability[i]);
|
||||
}
|
||||
glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
|
||||
|
||||
// glDrawElements should be more efficient, but on iOS glDrawArrays is faster.
|
||||
|
||||
const bool useDrawArrays = true;
|
||||
|
||||
if (useDrawArrays) {
|
||||
enum { ATTRIB_VERTEX, ATTRIB_TEXTUREPOSITON, NUM_ATTRIBUTES };
|
||||
|
||||
static const GLfloat squareVertices[] = {
|
||||
-1.0f,
|
||||
-1.0f, // bottom left
|
||||
1.0f,
|
||||
-1.0f, // bottom right
|
||||
-1.0f,
|
||||
1.0f, // top left
|
||||
1.0f,
|
||||
1.0f, // top right
|
||||
};
|
||||
|
||||
static const float textureVertices[] = {
|
||||
0.0f,
|
||||
0.0f, // bottom left
|
||||
1.0f,
|
||||
0.0f, // bottom right
|
||||
0.0f,
|
||||
1.0f, // top left
|
||||
1.0f,
|
||||
1.0f, // top right
|
||||
};
|
||||
|
||||
glBindBuffer(GL_ARRAY_BUFFER, 0);
|
||||
glVertexAttribPointer(ATTRIB_VERTEX, 2, GL_FLOAT, 0, 0, squareVertices);
|
||||
glEnableVertexAttribArray(ATTRIB_VERTEX);
|
||||
checkGLError(
|
||||
[&](std::stringstream& errmsg) { errmsg << "glEnableVertexAttribArray(ATTRIB_VERTEX)"; });
|
||||
|
||||
glVertexAttribPointer(ATTRIB_TEXTUREPOSITON, 2, GL_FLOAT, 0, 0, textureVertices);
|
||||
glEnableVertexAttribArray(ATTRIB_TEXTUREPOSITON);
|
||||
checkGLError([&](std::stringstream& errmsg) {
|
||||
errmsg << "glEnableVertexAttribArray(ATTRIB_TEXTUREPOSITON)";
|
||||
});
|
||||
|
||||
gl_log(GL_VERBOSE, "Calling glDrawArrays\n");
|
||||
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
|
||||
|
||||
checkGLError([&](std::stringstream& errmsg) { errmsg << "glDrawArrays"; });
|
||||
} else {
|
||||
// Run the shaders on the output geometry
|
||||
static const GLfloat vVertices[] = {
|
||||
-1.0f, -1.0f, 0.0f, // Position 0
|
||||
0.0f, 0.0f, // TexCoord 0
|
||||
-1.0f, 1.0f, 0.0f, // Position 1
|
||||
0.0f, 1.0f, // TexCoord 1
|
||||
1.0f, 1.0f, 0.0f, // Position 2
|
||||
1.0f, 1.0f, // TexCoord 2
|
||||
1.0f, -1.0f, 0.0f, // Position 3
|
||||
1.0f, 0.0f // TexCoord 3
|
||||
};
|
||||
static const GLushort indices[] = {0, 1, 2, 0, 2, 3};
|
||||
|
||||
// Load the vertex position
|
||||
glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 5 * sizeof(GLfloat), vVertices);
|
||||
// Load the texture coordinate
|
||||
glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 5 * sizeof(GLfloat), &vVertices[3]);
|
||||
|
||||
glEnableVertexAttribArray(0);
|
||||
glEnableVertexAttribArray(1);
|
||||
|
||||
gl_log(GL_VERBOSE, "Calling glDrawElements\n");
|
||||
glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_SHORT, indices);
|
||||
|
||||
checkGLError([&](std::stringstream& errmsg) { errmsg << "glDrawElements"; });
|
||||
}
|
||||
|
||||
#if CAFFE2_ANDROID
|
||||
glFlush();
|
||||
#endif
|
||||
|
||||
// Unbind the current texture - Man, this is expensive!
|
||||
for (int i = texture_idx - 1; i >= first_texture_id; i--) {
|
||||
gl_log(GL_VERBOSE, "unbinding texture unit %d\n", i - GL_TEXTURE0);
|
||||
glActiveTexture(i);
|
||||
glBindTexture(GL_TEXTURE_2D, 0);
|
||||
|
||||
checkGLError([&](std::stringstream& errmsg) {
|
||||
errmsg << "Error unbinding texture unit " << i - GL_TEXTURE0;
|
||||
});
|
||||
}
|
||||
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, defaultFramebuffer);
|
||||
}
|
||||
|
||||
void GLFilter::releaseBuffers() {
|
||||
for (int i = 0; i < kMaxUniformBlocks; i++) {
|
||||
if (uniformBlock[i]) {
|
||||
gl_log(GL_VERBOSE, "deleting uniform buffer block %d\n", uniformBlock[i]);
|
||||
glDeleteBuffers(1, &uniformBlock[i]);
|
||||
uniformBlock[i] = 0;
|
||||
}
|
||||
}
|
||||
if (frameBuffer) {
|
||||
gl_log(GL_VERBOSE, "deleting frame buffer %d\n", frameBuffer);
|
||||
glDeleteFramebuffers(1, &frameBuffer);
|
||||
frameBuffer = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void GLFilter::deleteProgram() {
|
||||
if (program) {
|
||||
gl_log(GL_VERBOSE, "deleting program %d\n", program);
|
||||
glDeleteProgram(program);
|
||||
program = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void GLFilter::deleteBindings() {
|
||||
for (binding* uniform : uniforms_) {
|
||||
delete uniform;
|
||||
}
|
||||
for (binding* uniform_block : uniform_blocks_) {
|
||||
delete uniform_block;
|
||||
}
|
||||
for (binding* attribute : attributes_) {
|
||||
delete attribute;
|
||||
}
|
||||
}
|
||||
|
||||
// Simple vertex shader setting up the coordinates system
|
||||
const char* GLFilter::vertex_shader = R"GLSL(#version 300 es
|
||||
|
||||
layout(location = 0) in vec4 a_position;
|
||||
layout(location = 1) in vec2 a_texCoord;
|
||||
out vec2 v_texCoord;
|
||||
|
||||
void main()
|
||||
{
|
||||
gl_Position = a_position;
|
||||
v_texCoord = a_texCoord;
|
||||
}
|
||||
)GLSL";
|
||||
|
||||
bool GLFilter::createProgram(const GLchar* vertSource,
|
||||
const GLchar* fragSource,
|
||||
GLuint* program) const {
|
||||
GLuint vertShader = 0, fragShader = 0, prog = 0, status = 1;
|
||||
|
||||
// Clear the error state. We check error state later in the function and
|
||||
// want to capture only errors in filter program initialization.
|
||||
glGetError();
|
||||
|
||||
// Create shader program
|
||||
prog = glCreateProgram();
|
||||
|
||||
// Create and compile vertex shader
|
||||
status *= compileShader(GL_VERTEX_SHADER, 1, &vertSource, &vertShader);
|
||||
|
||||
// Create and compile fragment shader
|
||||
status *= compileShader(GL_FRAGMENT_SHADER, 1, &fragSource, &fragShader);
|
||||
|
||||
// Attach vertex shader to program
|
||||
glAttachShader(prog, vertShader);
|
||||
|
||||
// Attach fragment shader to program
|
||||
glAttachShader(prog, fragShader);
|
||||
|
||||
// Bind attribute locations
|
||||
// This needs to be done prior to linking
|
||||
for (auto&& attribute : attributes_) {
|
||||
glBindAttribLocation(prog, attribute->location, attribute->name.c_str());
|
||||
|
||||
checkGLError([&](std::stringstream& errmsg) {
|
||||
errmsg << "Couldn't bind attribute: " << attribute->name << " at location "
|
||||
<< attribute->location;
|
||||
});
|
||||
}
|
||||
|
||||
// Link program
|
||||
status *= linkProgram(prog);
|
||||
|
||||
// Get locations of uniforms
|
||||
if (status) {
|
||||
for (auto&& uniform : uniforms_) {
|
||||
uniform->location = glGetUniformLocation(prog, uniform->name.c_str());
|
||||
|
||||
checkGLError([&](std::stringstream& errmsg) {
|
||||
errmsg << "Couldn't resolve uniform: " << uniform->name;
|
||||
});
|
||||
}
|
||||
|
||||
for (auto&& uniform_block : uniform_blocks_) {
|
||||
uniform_block->location = glGetUniformBlockIndex(prog, uniform_block->name.c_str());
|
||||
gl_log(GL_VERBOSE,
|
||||
"Getting location for uniform block: %s, location: %d\n",
|
||||
uniform_block->name.c_str(),
|
||||
uniform_block->location);
|
||||
|
||||
checkGLError([&](std::stringstream& errmsg) {
|
||||
errmsg << "Couldn't resolve uniform block: " << uniform_block->name;
|
||||
});
|
||||
}
|
||||
|
||||
*program = prog;
|
||||
}
|
||||
|
||||
// Release vertex and fragment shaders
|
||||
if (vertShader) {
|
||||
glDetachShader(prog, vertShader);
|
||||
glDeleteShader(vertShader);
|
||||
}
|
||||
if (fragShader) {
|
||||
glDetachShader(prog, fragShader);
|
||||
glDeleteShader(fragShader);
|
||||
}
|
||||
|
||||
return status == 1;
|
||||
}
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
/* Compile a shader from the provided source(s) */
|
||||
GLint GLFilter::compileShader(GLenum target,
|
||||
GLsizei count,
|
||||
const GLchar** sources,
|
||||
GLuint* shader) const {
|
||||
GLint status = 1;
|
||||
|
||||
*shader = glCreateShader(target);
|
||||
glShaderSource(*shader, count, sources, NULL);
|
||||
glCompileShader(*shader);
|
||||
|
||||
GLint logLength = 0;
|
||||
glGetShaderiv(*shader, GL_INFO_LOG_LENGTH, &logLength);
|
||||
if (logLength > 0) {
|
||||
std::vector<GLchar> log(logLength);
|
||||
glGetShaderInfoLog(*shader, logLength, &logLength, &log[0]);
|
||||
gl_log(GL_ERR, "Shader compile log:\n%s", &log[0]);
|
||||
}
|
||||
|
||||
glGetShaderiv(*shader, GL_COMPILE_STATUS, &status);
|
||||
if (status == 0) {
|
||||
int i;
|
||||
|
||||
gl_log(GL_ERR, "Failed to compile shader:\n");
|
||||
for (i = 0; i < count; i++)
|
||||
gl_log(GL_ERR, "%s", sources[i]);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
/* Link a program with all currently attached shaders */
|
||||
GLint GLFilter::linkProgram(GLuint program) const {
|
||||
GLint status = 1;
|
||||
|
||||
glLinkProgram(program);
|
||||
|
||||
GLint logLength = 0;
|
||||
glGetProgramiv(program, GL_INFO_LOG_LENGTH, &logLength);
|
||||
if (logLength > 0) {
|
||||
std::vector<GLchar> log(logLength);
|
||||
glGetProgramInfoLog(program, logLength, &logLength, &log[0]);
|
||||
gl_log(GL_ERR, "Program link log:\n%s", &log[0]);
|
||||
}
|
||||
|
||||
glGetProgramiv(program, GL_LINK_STATUS, &status);
|
||||
if (status == 0)
|
||||
gl_log(GL_ERR, "Failed to link program %d\n", program);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
/* Validate a program (for i.e. inconsistent samplers) */
|
||||
GLint GLFilter::validateProgram(GLuint program) const {
|
||||
GLint status = 1;
|
||||
|
||||
glValidateProgram(program);
|
||||
|
||||
GLint logLength = 0;
|
||||
glGetProgramiv(program, GL_INFO_LOG_LENGTH, &logLength);
|
||||
if (logLength > 0) {
|
||||
std::vector<GLchar> log(logLength);
|
||||
glGetProgramInfoLog(program, logLength, &logLength, &log[0]);
|
||||
gl_log(GL_ERR, "Program validate log:\n%s", &log[0]);
|
||||
}
|
||||
|
||||
glGetProgramiv(program, GL_VALIDATE_STATUS, &status);
|
||||
if (status == 0)
|
||||
gl_log(GL_ERR, "Failed to validate program %d\n", program);
|
||||
|
||||
return status;
|
||||
}
|
@ -1,104 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GLContext.h"
|
||||
#include "GLTexture.h"
|
||||
#include "arm_neon_support.h"
|
||||
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#define BINDING(variableName) (variableName = new binding{#variableName})
|
||||
#define ATTRIBUTE(variableName, value) (variableName = new binding{#variableName, value})
|
||||
|
||||
class GLFilter {
|
||||
protected:
|
||||
const std::string kernel_name;
|
||||
GLuint program = 0;
|
||||
GLuint frameBuffer = 0;
|
||||
static constexpr int kMaxUniformBlocks = 12;
|
||||
GLuint uniformBlock[kMaxUniformBlocks] = {0};
|
||||
GLint blockSize[kMaxUniformBlocks] = {0};
|
||||
bool frame_buffer_initialized = false;
|
||||
|
||||
// glGetError() can be expensive, we should turn error checking off when we're done with debugging
|
||||
|
||||
static constexpr bool check_opengl_errors = true;
|
||||
|
||||
public:
|
||||
typedef std::vector<std::pair<std::string, std::string>> replacements_t;
|
||||
|
||||
struct binding {
|
||||
const std::string name;
|
||||
GLint location;
|
||||
};
|
||||
|
||||
struct texture_attachment {
|
||||
const GLTexture* texture;
|
||||
const binding* uniform;
|
||||
};
|
||||
|
||||
GLFilter(const std::string kernel_name,
|
||||
const std::string vertex_shader,
|
||||
const std::string fragment_shader,
|
||||
const std::vector<binding*> uniforms,
|
||||
const std::vector<binding*> uniform_blocks = {},
|
||||
const std::vector<binding*> attributes = {},
|
||||
const replacements_t& replacements = {});
|
||||
|
||||
// TODO: The set and reset context need to be commented out for unit testing
|
||||
~GLFilter() {
|
||||
releaseBuffers();
|
||||
deleteProgram();
|
||||
deleteBindings();
|
||||
}
|
||||
|
||||
void throwRuntimeError(std::function<void(std::stringstream& errmsg)> error_formatter) const {
|
||||
std::stringstream errmsg;
|
||||
errmsg << kernel_name << ": ";
|
||||
error_formatter(errmsg);
|
||||
throw std::runtime_error(errmsg.str());
|
||||
}
|
||||
|
||||
void checkGLError(std::function<void(std::stringstream& errmsg)> error_formatter) const {
|
||||
if (check_opengl_errors) {
|
||||
GLenum glError = glGetError();
|
||||
if (glError != GL_NO_ERROR) {
|
||||
throwRuntimeError([&](std::stringstream& errmsg) {
|
||||
error_formatter(errmsg);
|
||||
errmsg << ", " << glError;
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void attach_uniform_buffer(const binding* block,
|
||||
GLuint bindingPoint, std::function<void(T*, size_t)> loader);
|
||||
|
||||
void run(const std::vector<texture_attachment>& input,
|
||||
const std::vector<const GLTexture*>& output,
|
||||
std::function<void(void)> uniforms_initializer,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
void releaseBuffers();
|
||||
void deleteProgram();
|
||||
void deleteBindings();
|
||||
|
||||
static const char* vertex_shader;
|
||||
|
||||
private:
|
||||
const std::vector<binding*> uniforms_;
|
||||
const std::vector<binding*> uniform_blocks_;
|
||||
const std::vector<binding*> attributes_;
|
||||
|
||||
std::string process_replacements(std::string source, const replacements_t& replacements) const;
|
||||
|
||||
bool createProgram(const GLchar* vertSource, const GLchar* fragSource, GLuint* program) const;
|
||||
|
||||
GLint compileShader(GLenum target, GLsizei count, const GLchar** sources, GLuint* shader) const;
|
||||
GLint linkProgram(GLuint program) const;
|
||||
GLint validateProgram(GLuint program) const;
|
||||
};
|
@ -1,15 +0,0 @@
|
||||
|
||||
#include "GLImage.h"
|
||||
#include "arm_neon_support.h"
|
||||
#include <c10/util/typeid.h>
|
||||
|
||||
namespace caffe2 {
|
||||
CAFFE_KNOWN_TYPE(GLImage<float>);
|
||||
CAFFE_KNOWN_TYPE(GLImage<uint8_t>);
|
||||
CAFFE_KNOWN_TYPE(GLImageVector<float>);
|
||||
CAFFE_KNOWN_TYPE(GLImageVector<uint8_t>);
|
||||
#ifdef __ARM_NEON__
|
||||
CAFFE_KNOWN_TYPE(GLImage<float16_t>);
|
||||
CAFFE_KNOWN_TYPE(GLImageVector<float16_t>);
|
||||
#endif
|
||||
} // namespace caffe2
|
@ -1,151 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GLTexture.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
#include <functional>
|
||||
#include <vector>
|
||||
|
||||
template <typename T>
|
||||
class GLImage {
|
||||
public:
|
||||
const int width;
|
||||
const int height;
|
||||
const int channels;
|
||||
const int data_size;
|
||||
|
||||
const int tile_x;
|
||||
const int tile_y;
|
||||
const int texture_width;
|
||||
const int texture_height;
|
||||
const int slices;
|
||||
|
||||
const std::vector<const GLTexture*> textures;
|
||||
|
||||
constexpr static int slice_channels = 4;
|
||||
|
||||
static constexpr int channels_to_slices(int channels, int tile_x, int tile_y) {
|
||||
return ((channels + slice_channels - 1) / slice_channels + tile_x * tile_y - 1) /
|
||||
(tile_x * tile_y);
|
||||
}
|
||||
|
||||
static const std::vector<const GLTexture*> allocate_textures(
|
||||
int slices, std::function<const GLTexture*(int slice)> texture_loader) {
|
||||
std::vector<const GLTexture*> textures;
|
||||
for (int i = 0; i < slices; i++) {
|
||||
textures.push_back(texture_loader(i));
|
||||
}
|
||||
return textures;
|
||||
}
|
||||
|
||||
GLImage(int _width,
|
||||
int _height,
|
||||
int _channels,
|
||||
int _tile_x,
|
||||
int _tile_y,
|
||||
std::function<const GLTexture*(int slice)> texture_loader)
|
||||
: width(_width),
|
||||
height(_height),
|
||||
channels(_channels),
|
||||
data_size(sizeof(T)),
|
||||
tile_x(_tile_x),
|
||||
tile_y(_tile_y),
|
||||
texture_width(_width * _tile_x),
|
||||
texture_height(_height * _tile_y),
|
||||
slices(channels_to_slices(_channels, _tile_x, _tile_y)),
|
||||
textures(allocate_textures(slices, texture_loader)) {
|
||||
CAFFE_ENFORCE_EQ(
|
||||
slices, ((channels + 3) / 4 + tile_x * tile_y - 1) / (tile_x * tile_y));
|
||||
}
|
||||
|
||||
GLImage(int _width,
|
||||
int _height,
|
||||
int _channels,
|
||||
int _tile_x,
|
||||
int _tile_y,
|
||||
bool _destroy,
|
||||
std::function<const GLTexture*(int slice)> texture_loader)
|
||||
: width(_width),
|
||||
height(_height),
|
||||
channels(_channels),
|
||||
data_size(sizeof(T)),
|
||||
tile_x(_tile_x),
|
||||
tile_y(_tile_y),
|
||||
texture_width(_width * _tile_x),
|
||||
texture_height(_height * _tile_y),
|
||||
slices(channels_to_slices(_channels, _tile_x, _tile_y)),
|
||||
textures(allocate_textures(slices, texture_loader)) {
|
||||
CAFFE_ENFORCE_EQ(slices * tile_x * tile_y, (channels + 3) / 4);
|
||||
}
|
||||
|
||||
GLImage()
|
||||
: width(0),
|
||||
height(0),
|
||||
channels(0),
|
||||
data_size(sizeof(T)),
|
||||
tile_x(0),
|
||||
tile_y(0),
|
||||
texture_width(0),
|
||||
texture_height(0),
|
||||
slices(0){};
|
||||
|
||||
virtual ~GLImage() {
|
||||
gl_log(GL_VERBOSE, "deleting GLImage\n");
|
||||
for (auto&& texture : textures) {
|
||||
delete texture;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
class GLImageVector {
|
||||
private:
|
||||
std::vector<GLImage<T>*> images_;
|
||||
int num_images_ = 0;
|
||||
int width_ = 0;
|
||||
int height_ = 0;
|
||||
int channels_ = 0;
|
||||
int tile_x_ = 0;
|
||||
int tile_y_ = 0;
|
||||
|
||||
public:
|
||||
GLImage<T>* operator[](int index) const {
|
||||
CAFFE_ENFORCE_LT(index, num_images_, "Out of bounds when accessing GLImageVector");
|
||||
return images_[index];
|
||||
}
|
||||
|
||||
void push_back(GLImage<T>* image) {
|
||||
CAFFE_ENFORCE_EQ(image->channels, channels_);
|
||||
CAFFE_ENFORCE_EQ(image->width, width_);
|
||||
CAFFE_ENFORCE_EQ(image->height, height_);
|
||||
CAFFE_ENFORCE_EQ(image->tile_x, tile_x_);
|
||||
CAFFE_ENFORCE_EQ(image->tile_y, tile_y_);
|
||||
images_.push_back(image);
|
||||
CAFFE_ENFORCE_LE(images_.size(), num_images_);
|
||||
}
|
||||
|
||||
int size() const { return images_.size(); }
|
||||
int channels() const { return channels_; }
|
||||
int width() const { return width_; }
|
||||
int height() const { return height_; }
|
||||
int tile_x() const { return tile_x_; }
|
||||
int tile_y() const { return tile_y_; }
|
||||
int slices() const { return size() > 0 ? images_[0]->slices : 0; }
|
||||
|
||||
GLImageVector(int num_images, int width, int height, int channels, int tile_x = 1, int tile_y = 1)
|
||||
: num_images_(num_images),
|
||||
width_(width),
|
||||
height_(height),
|
||||
channels_(channels),
|
||||
tile_x_(tile_x),
|
||||
tile_y_(tile_y) {}
|
||||
|
||||
GLImageVector() {}
|
||||
|
||||
~GLImageVector() {
|
||||
for (int i = 0; i < images_.size(); i++) {
|
||||
delete images_[i];
|
||||
}
|
||||
}
|
||||
};
|
@ -1,66 +0,0 @@
|
||||
|
||||
#include "GLImageAllocator.h"
|
||||
#include "arm_neon_support.h"
|
||||
|
||||
template <class T>
|
||||
GLImageVector<T>* GLImageAllocator<T>::newImage(
|
||||
int num_images, int width, int height, int channels, int tile_x, int tile_y, bool is_output) {
|
||||
GLImageVector<T>* images =
|
||||
new GLImageVector<T>(num_images, width, height, channels, tile_x, tile_y);
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
images->push_back(
|
||||
new GLImage<T>(width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
|
||||
bool usePadding = is_output;
|
||||
return new GLPlainTexture(type, nullptr, width * tile_x, height * tile_y, usePadding);
|
||||
}));
|
||||
}
|
||||
return images;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
GLImageVector<T>* GLImageAllocator<T>::newImage(
|
||||
int num_images,
|
||||
int width,
|
||||
int height,
|
||||
int channels,
|
||||
int tile_x,
|
||||
int tile_y,
|
||||
std::function<const GLTexture*(const int width, const int height)> textureAllocator) {
|
||||
GLImageVector<T>* images =
|
||||
new GLImageVector<T>(num_images, width, height, channels, tile_x, tile_y);
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
images->push_back(
|
||||
new GLImage<T>(width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
|
||||
return textureAllocator(width, height);
|
||||
}));
|
||||
}
|
||||
return images;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
GLImageVector<T>* GLImageAllocator<T>::ShareTexture(const GLuint textureID,
|
||||
int num_images,
|
||||
int width,
|
||||
int height,
|
||||
int channels,
|
||||
int tile_x,
|
||||
int tile_y) {
|
||||
GLImageVector<T>* images =
|
||||
new GLImageVector<T>(num_images, width, height, channels, tile_x, tile_y);
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
images->push_back(
|
||||
new GLImage<T>(width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
|
||||
return new GLPlainTexture(
|
||||
GLImageAllocator<T>::type, textureID, width * tile_x, height * tile_y);
|
||||
}));
|
||||
}
|
||||
return images;
|
||||
}
|
||||
|
||||
template <>
|
||||
const GLTexture::Type& GLImageAllocator<float16_t>::type = GLTexture::FP16;
|
||||
template <>
|
||||
const GLTexture::Type& GLImageAllocator<uint8_t>::type = GLTexture::UI8;
|
||||
|
||||
template class GLImageAllocator<float16_t>;
|
||||
template class GLImageAllocator<uint8_t>;
|
@ -1,37 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GLImage.h"
|
||||
#include "GLPlainTexture.h"
|
||||
|
||||
template <class T>
|
||||
class GLImageAllocator {
|
||||
public:
|
||||
static const GLTexture::Type& type;
|
||||
|
||||
GLImageAllocator() { gl_log(GL_VERBOSE, "%s\n", __PRETTY_FUNCTION__); }
|
||||
|
||||
virtual ~GLImageAllocator() { gl_log(GL_VERBOSE, "%s\n", __PRETTY_FUNCTION__); }
|
||||
|
||||
virtual GLImageVector<T>* newImage(
|
||||
int num_images, int width, int height, int channels, int tile_x, int tile_y, bool is_output);
|
||||
|
||||
virtual GLImageVector<T>* newImage(
|
||||
int num_images,
|
||||
int width,
|
||||
int height,
|
||||
int channels,
|
||||
int tile_x,
|
||||
int tile_y,
|
||||
std::function<const GLTexture*(const int width, const int height)> textureAllocator);
|
||||
|
||||
virtual GLImageVector<T>* ShareTexture(const GLuint textureID,
|
||||
int num_images,
|
||||
int width,
|
||||
int height,
|
||||
int channels,
|
||||
int tile_x = 1,
|
||||
int tile_y = 1);
|
||||
|
||||
static GLImageAllocator<T>* newGLImageAllocator();
|
||||
};
|
@ -1,20 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
|
||||
enum { GL_ERR = -1, GL_LOG = 0, GL_VERBOSE = 1 };
|
||||
|
||||
static constexpr int GL_LOG_LEVEL = GL_LOG;
|
||||
|
||||
static inline int gl_log(int level, const char* format, ...) {
|
||||
int r = 0;
|
||||
if (level <= GL_LOG_LEVEL) {
|
||||
va_list args;
|
||||
va_start(args, format);
|
||||
r = vfprintf(stderr, format, args);
|
||||
va_end(args);
|
||||
}
|
||||
return r;
|
||||
}
|
@ -1,93 +0,0 @@
|
||||
|
||||
#include "GLPBO.h"
|
||||
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
GLPBO::~GLPBO() {
|
||||
if (pboId != 0) {
|
||||
gl_log(GL_LOG, "deleting PBO buffer %d\n", pboId);
|
||||
glDeleteBuffers(1, &pboId);
|
||||
pboId = 0;
|
||||
}
|
||||
if (pboFrameBuffer != 0) {
|
||||
gl_log(GL_LOG, "deleting PBO frame buffer %d\n", pboFrameBuffer);
|
||||
glDeleteFramebuffers(1, &pboFrameBuffer);
|
||||
pboFrameBuffer = 0;
|
||||
}
|
||||
}
|
||||
|
||||
GLPBO* GLPBO::pboContext = NULL;
|
||||
|
||||
GLPBO* GLPBO::getContext() {
|
||||
if (pboContext == NULL) {
|
||||
pboContext = new GLPBO();
|
||||
}
|
||||
return pboContext;
|
||||
}
|
||||
|
||||
void GLPBO::mapTextureData(GLuint _textureId,
|
||||
GLsizei _width,
|
||||
GLsizei _height,
|
||||
GLsizei _stride,
|
||||
GLsizei _channels,
|
||||
const GLTexture::Type& _type,
|
||||
std::function<void(const void* buffer,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t stride,
|
||||
size_t channels,
|
||||
const GLTexture::Type& type)> process) {
|
||||
GLint defaultFramebuffer = 0;
|
||||
glGetIntegerv(GL_FRAMEBUFFER_BINDING, &defaultFramebuffer);
|
||||
|
||||
if (pboFrameBuffer == 0) {
|
||||
glGenFramebuffers(1, &pboFrameBuffer);
|
||||
gl_log(GL_VERBOSE, "created PBO frame buffer %d\n", pboFrameBuffer);
|
||||
}
|
||||
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, pboFrameBuffer);
|
||||
|
||||
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, _textureId, 0);
|
||||
|
||||
int fbs = glCheckFramebufferStatus(GL_FRAMEBUFFER);
|
||||
if (fbs != GL_FRAMEBUFFER_COMPLETE) {
|
||||
std::stringstream errmsg;
|
||||
errmsg << ": Frame buffer incomplete: " << fbs;
|
||||
throw std::runtime_error(errmsg.str());
|
||||
}
|
||||
|
||||
if (pboId == 0) {
|
||||
glGenBuffers(1, &pboId);
|
||||
gl_log(GL_VERBOSE, "created PBO buffer %d\n", pboId);
|
||||
}
|
||||
glBindBuffer(GL_PIXEL_PACK_BUFFER, pboId);
|
||||
|
||||
size_t buffer_size = _stride * _height * _channels * _type.dataSize();
|
||||
|
||||
if (buffer_size > pboSize) {
|
||||
LOG(INFO) << "Allocating PBO of capacity " << buffer_size;
|
||||
|
||||
glBufferData(GL_PIXEL_PACK_BUFFER, buffer_size, NULL, GL_DYNAMIC_READ);
|
||||
pboSize = buffer_size;
|
||||
}
|
||||
|
||||
glReadBuffer(GL_COLOR_ATTACHMENT0);
|
||||
glReadPixels(0, 0, _stride, _height, _type.format, _type.type, 0);
|
||||
|
||||
GLhalf* ptr = (GLhalf*)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, buffer_size, GL_MAP_READ_BIT);
|
||||
|
||||
if (ptr) {
|
||||
process(ptr, _width, _height, _stride, _channels, _type);
|
||||
} else {
|
||||
std::stringstream errmsg;
|
||||
errmsg << ": glMapBufferRange using PBO incomplete";
|
||||
throw std::runtime_error(errmsg.str());
|
||||
}
|
||||
|
||||
// Unmap buffer
|
||||
glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
|
||||
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
|
||||
|
||||
// Bind to the default FrameBuffer
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, defaultFramebuffer);
|
||||
}
|
@ -1,31 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GLTexture.h"
|
||||
#include <functional>
|
||||
|
||||
class GLPBO {
|
||||
GLuint pboId = 0;
|
||||
GLuint pboSize = 0;
|
||||
GLuint pboFrameBuffer = 0;
|
||||
|
||||
~GLPBO();
|
||||
|
||||
static GLPBO* pboContext;
|
||||
|
||||
public:
|
||||
void mapTextureData(GLuint _textureId,
|
||||
GLsizei _width,
|
||||
GLsizei _height,
|
||||
GLsizei _stride,
|
||||
GLsizei _channels,
|
||||
const GLTexture::Type& type,
|
||||
std::function<void(const void* buffer,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t stride,
|
||||
size_t channels,
|
||||
const GLTexture::Type& type)> process);
|
||||
|
||||
static GLPBO* getContext();
|
||||
};
|
@ -1,58 +0,0 @@
|
||||
|
||||
#include "GLPlainTexture.h"
|
||||
#include "GLPBO.h"
|
||||
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
|
||||
#define half_float_supported (GLContext::getGLContext()->halfFloatTextureSupported())
|
||||
|
||||
#define FIXED_TYPE(_t) (((_t).type != GL_HALF_FLOAT || half_float_supported) ? (_t) : GLTexture::FP16_COMPAT)
|
||||
|
||||
GLPlainTexture::GLPlainTexture(
|
||||
const Type& type, const void* input, GLsizei width, GLsizei height, bool use_padding, GLint filter, GLint wrap)
|
||||
: GLTexture(FIXED_TYPE(type), width, height, use_padding, filter, wrap) {
|
||||
// caffe2::Timer timer;
|
||||
// timer.Start();
|
||||
glGenTextures(1, &_textureId);
|
||||
glBindTexture(GL_TEXTURE_2D, _textureId);
|
||||
glTexImage2D(GL_TEXTURE_2D, 0, _type.internalFormat, _stride, _height, 0, _type.format, _type.type, input);
|
||||
|
||||
gl_log(
|
||||
GL_VERBOSE,
|
||||
"GLPlainTexture() - allocated textureId %d, internalFormat: 0x%X, format: 0x%X, type: 0x%X\n",
|
||||
_textureId,
|
||||
_type.internalFormat,
|
||||
_type.format,
|
||||
_type.type);
|
||||
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, _filter);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, _filter);
|
||||
|
||||
#if GL_EXT_texture_border_clamp
|
||||
GLfloat borderColor[] = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR_EXT, borderColor);
|
||||
// Set the texture to use the border clamp wrapping mode.
|
||||
_wrap = GL_CLAMP_TO_BORDER_EXT;
|
||||
#endif
|
||||
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, _wrap);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, _wrap);
|
||||
|
||||
glBindTexture(GL_TEXTURE_2D, 0);
|
||||
// LOG(INFO) << "glTexImage2D takes " << timer.MilliSeconds() << " ms";
|
||||
}
|
||||
|
||||
GLPlainTexture::GLPlainTexture(
|
||||
const Type& type, const GLuint textureID, GLsizei width, GLsizei height, bool use_padding, GLint filter, GLint wrap)
|
||||
: GLTexture(FIXED_TYPE(type), width, height, use_padding, filter, wrap) {
|
||||
_textureId = textureID;
|
||||
isOwner = false;
|
||||
gl_log(
|
||||
GL_VERBOSE,
|
||||
"GLPlainTexture() - wrapped textureId %d, internalFormat: 0x%X, format: 0x%X, type: 0x%X\n",
|
||||
_textureId,
|
||||
_type.internalFormat,
|
||||
_type.format,
|
||||
_type.type);
|
||||
}
|
@ -1,44 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GLContext.h"
|
||||
#include "GLTexture.h"
|
||||
|
||||
class GLPlainTexture : public GLTexture {
|
||||
private:
|
||||
bool isOwner = true;
|
||||
|
||||
public:
|
||||
GLPlainTexture(const Type& type,
|
||||
const void* input,
|
||||
GLsizei width,
|
||||
GLsizei height,
|
||||
bool use_padding = false,
|
||||
GLint filter = GL_NEAREST,
|
||||
GLint wrap = GL_CLAMP_TO_EDGE);
|
||||
|
||||
GLPlainTexture(const Type& type,
|
||||
const GLuint textureID,
|
||||
GLsizei width,
|
||||
GLsizei height,
|
||||
bool use_padding = false,
|
||||
GLint filter = GL_NEAREST,
|
||||
GLint wrap = GL_CLAMP_TO_EDGE);
|
||||
|
||||
~GLPlainTexture() {
|
||||
if (glIsTexture(_textureId)) {
|
||||
if (isOwner) {
|
||||
gl_log(GL_VERBOSE, "~GLPlainTexture() - deleting texture %d\n", _textureId);
|
||||
glDeleteTextures(1, &_textureId);
|
||||
}
|
||||
} else {
|
||||
gl_log(GL_ERR, "not deleting texture %d\n", _textureId);
|
||||
}
|
||||
}
|
||||
|
||||
GLuint name() const { return _textureId; };
|
||||
|
||||
GLenum target() const { return GL_TEXTURE_2D; };
|
||||
|
||||
bool flipped() const { return false; };
|
||||
};
|
@ -1,63 +0,0 @@
|
||||
|
||||
#include "GLPredictor.h"
|
||||
#include "GLContext.h"
|
||||
#include "rewrite_net.h"
|
||||
#include <vector>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <class T>
|
||||
void shareInputGLImage(Workspace* ws, const std::string& name, GLImageVector<T>* input) {
|
||||
auto* blob = ws->GetBlob(name);
|
||||
CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist");
|
||||
blob->ShareExternal<GLImageVector<T>>(input);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
const GLImageVector<T>* extractOutputGLImage(Workspace* ws, const std::string& name) {
|
||||
auto* blob = ws->GetBlob(name);
|
||||
CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist");
|
||||
return &blob->template Get<GLImageVector<T>>();
|
||||
}
|
||||
|
||||
const NetDef create_gl_run_net(const NetDef& init_net,
|
||||
const NetDef& run_net,
|
||||
bool use_texture_input) {
|
||||
NetDef gl_run_net;
|
||||
if (!tryConvertToOpenGL(init_net, run_net, &gl_run_net, use_texture_input)) {
|
||||
CAFFE_THROW("Failed to convert model to OpenGL");
|
||||
}
|
||||
return gl_run_net;
|
||||
}
|
||||
|
||||
GLPredictor::GLPredictor(const NetDef& init_net,
|
||||
const NetDef& run_net,
|
||||
bool use_texture_input,
|
||||
Workspace* parent)
|
||||
: Predictor(init_net, create_gl_run_net(init_net, run_net, use_texture_input), parent) {}
|
||||
|
||||
GLPredictor::~GLPredictor() {}
|
||||
|
||||
template <class T>
|
||||
bool GLPredictor::run(std::vector<GLImageVector<T>*>& inputs,
|
||||
std::vector<const GLImageVector<T>*>* outputs) {
|
||||
const NetDef& run_net_ = Predictor::def();
|
||||
CAFFE_ENFORCE(inputs.size() <= run_net_.external_input_size());
|
||||
for (auto i = 0; i < inputs.size(); ++i) {
|
||||
shareInputGLImage<T>(Predictor::ws(), run_net_.external_input(i), inputs[i]);
|
||||
}
|
||||
|
||||
if (!Predictor::ws()->RunNet(run_net_.name())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (auto i = 0; i < run_net_.external_output_size(); ++i) {
|
||||
outputs->push_back(extractOutputGLImage<T>(Predictor::ws(), run_net_.external_output(i)));
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template bool GLPredictor::run(std::vector<GLImageVector<uint8_t>*>& inputs,
|
||||
std::vector<const GLImageVector<uint8_t>*>* outputs);
|
||||
} // namespace caffe2
|
@ -1,21 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GLImage.h"
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/predictor/predictor.h"
|
||||
|
||||
namespace caffe2 {
|
||||
class GLPredictor : public Predictor {
|
||||
public:
|
||||
GLPredictor(const NetDef& init_net,
|
||||
const NetDef& run_net,
|
||||
bool use_texture_input = false,
|
||||
Workspace* parent = nullptr);
|
||||
|
||||
template <class T>
|
||||
bool run(std::vector<GLImageVector<T>*>& inputs, std::vector<const GLImageVector<T>*>* outputs);
|
||||
|
||||
~GLPredictor();
|
||||
};
|
||||
} // namespace caffe2
|
@ -1,71 +0,0 @@
|
||||
|
||||
#include "GLTexture.h"
|
||||
#include "DataTransfer.h"
|
||||
#include "GLPBO.h"
|
||||
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
|
||||
#if CAFFE2_ANDROID && defined(__ARM_NEON__)
|
||||
|
||||
#include "../android/AndroidGLContext.h"
|
||||
|
||||
// https://community.arm.com/thread/10002
|
||||
void arm_memcpy(volatile unsigned char* dst, volatile unsigned char* src, int sz) {
|
||||
if (sz & 63) {
|
||||
sz = (sz & -64) + 64;
|
||||
}
|
||||
|
||||
asm volatile(
|
||||
"NEONCopyPLD: \n"
|
||||
" VLDM %[src]!,{d0-d7} \n"
|
||||
" VSTM %[dst]!,{d0-d7} \n"
|
||||
" SUBS %[sz],%[sz],#0x40 \n"
|
||||
" BGT NEONCopyPLD \n"
|
||||
: [dst] "+r"(dst), [src] "+r"(src), [sz] "+r"(sz)
|
||||
:
|
||||
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
|
||||
}
|
||||
#endif
|
||||
|
||||
const GLTexture::Type GLTexture::FP16 = {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT};
|
||||
const GLTexture::Type GLTexture::UI8 = {GL_RGBA, GL_RGBA, GL_UNSIGNED_BYTE};
|
||||
const GLTexture::Type GLTexture::FP16_COMPAT = {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT};
|
||||
|
||||
void GLTexture::map_read(std::function<void(const void* buffer,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t stride,
|
||||
size_t channels,
|
||||
const Type& type)> process) const {
|
||||
GLPBO* pbo = GLPBO::getContext();
|
||||
pbo->mapTextureData(_textureId, _width, _height, _stride, _channels, _type, process);
|
||||
}
|
||||
|
||||
void GLTexture::map_load(std::function<void(void* buffer,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t stride,
|
||||
size_t channels,
|
||||
const Type& type)> process) const {
|
||||
const int alignment = 32; // 4 * _type.dataSize();
|
||||
void* buffer = nullptr;
|
||||
size_t buffer_size = _width * _height * _channels * _type.dataSize();
|
||||
|
||||
#ifdef __ANDROID__
|
||||
buffer = (void*)memalign(alignment, buffer_size);
|
||||
#else
|
||||
posix_memalign((void**)&buffer, alignment, buffer_size);
|
||||
#endif
|
||||
CAFFE_ENFORCE(buffer);
|
||||
|
||||
process(buffer, _width, _height, _width, _channels, _type);
|
||||
loadData(buffer);
|
||||
free(buffer);
|
||||
}
|
||||
|
||||
void GLTexture::loadData(const void* pixels) const {
|
||||
glBindTexture(GL_TEXTURE_2D, _textureId);
|
||||
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, _width, _height, _type.format, _type.type, pixels);
|
||||
glBindTexture(GL_TEXTURE_2D, 0);
|
||||
}
|
@ -1,105 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
#include "GL.h"
|
||||
#include "GLLogging.h"
|
||||
|
||||
class GLTexture {
|
||||
public:
|
||||
struct Type {
|
||||
const GLenum internalFormat;
|
||||
const GLenum format;
|
||||
const GLenum type;
|
||||
|
||||
int dataSize() const {
|
||||
switch (type) {
|
||||
case GL_UNSIGNED_INT:
|
||||
return 4;
|
||||
case GL_HALF_FLOAT:
|
||||
return 2;
|
||||
case GL_UNSIGNED_BYTE:
|
||||
return 1;
|
||||
default:
|
||||
throw std::runtime_error("Unknown Texture Type");
|
||||
}
|
||||
}
|
||||
|
||||
int channels() const {
|
||||
switch (format) {
|
||||
case GL_R8:
|
||||
return 1;
|
||||
case GL_RG8:
|
||||
return 2;
|
||||
// case GL_BGRA:
|
||||
case GL_RG_INTEGER:
|
||||
case GL_RGBA:
|
||||
return 4;
|
||||
default:
|
||||
throw std::runtime_error("Unknown Texture Format");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static const Type FP16;
|
||||
static const Type FP16_COMPAT;
|
||||
static const Type UI8;
|
||||
|
||||
protected:
|
||||
const Type& _type;
|
||||
|
||||
const GLsizei _width;
|
||||
const GLsizei _height;
|
||||
const GLsizei _stride;
|
||||
const GLsizei _channels;
|
||||
const bool _use_padding;
|
||||
|
||||
GLint _filter;
|
||||
GLint _wrap;
|
||||
GLuint _textureId;
|
||||
|
||||
public:
|
||||
GLTexture(const Type& type,
|
||||
int width,
|
||||
int height,
|
||||
int stride,
|
||||
bool use_padding,
|
||||
GLint filter,
|
||||
GLint wrap)
|
||||
: _type(type),
|
||||
_width(width),
|
||||
_height(height),
|
||||
_stride(stride),
|
||||
_channels(type.channels()),
|
||||
_use_padding(use_padding),
|
||||
_filter(filter),
|
||||
_wrap(wrap) {}
|
||||
|
||||
GLTexture(const Type& type, int width, int height, bool use_padding, GLint filter, GLint wrap)
|
||||
: GLTexture(type,
|
||||
width,
|
||||
height,
|
||||
use_padding ? (width + 7) / 8 * 8 : width,
|
||||
use_padding,
|
||||
filter,
|
||||
wrap) {}
|
||||
|
||||
virtual ~GLTexture() {}
|
||||
virtual GLuint name() const = 0;
|
||||
virtual GLenum target() const = 0;
|
||||
virtual bool flipped() const = 0;
|
||||
|
||||
virtual void map_read(std::function<void(const void* buffer,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t stride,
|
||||
size_t channels,
|
||||
const Type& type)> process) const;
|
||||
|
||||
virtual void map_load(std::function<void(void* buffer,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t stride,
|
||||
size_t channels,
|
||||
const Type& type)> process) const;
|
||||
|
||||
void loadData(const void* pixels) const;
|
||||
};
|
@ -1,47 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GLImageAllocator.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <class T>
|
||||
class ImageAllocator {
|
||||
GLImageAllocator<T>* glImageAllocator;
|
||||
|
||||
public:
|
||||
ImageAllocator() : glImageAllocator(GLImageAllocator<T>::newGLImageAllocator()) {}
|
||||
|
||||
virtual ~ImageAllocator() { delete glImageAllocator; }
|
||||
|
||||
GLImageVector<T>* newImage(
|
||||
int num_images, int width, int height, int channels, bool is_output = false) {
|
||||
const int tile_x = 1, tile_y = 1;
|
||||
return glImageAllocator->newImage(
|
||||
num_images, width, height, channels, tile_x, tile_y, is_output);
|
||||
}
|
||||
|
||||
GLImageVector<T>* newImage(int num_images,
|
||||
int width,
|
||||
int height,
|
||||
int channels,
|
||||
int tile_x,
|
||||
int tile_y,
|
||||
bool is_output = false) {
|
||||
return glImageAllocator->newImage(
|
||||
num_images, width, height, channels, tile_x, tile_y, is_output);
|
||||
}
|
||||
|
||||
GLImageVector<T>* newImage(
|
||||
int num_images,
|
||||
int width,
|
||||
int height,
|
||||
int channels,
|
||||
int tile_x,
|
||||
int tile_y,
|
||||
std::function<const GLTexture*(const int width, const int height)> textureAllocator) {
|
||||
return glImageAllocator->newImage(
|
||||
num_images, width, height, channels, tile_x, tile_y, textureAllocator);
|
||||
}
|
||||
};
|
||||
} // namespace caffe2
|
@ -1,12 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
#ifdef __ARM_NEON__
|
||||
#if CAFFE2_IOS
|
||||
#include "arm_neon.h"
|
||||
#elif CAFFE2_ANDROID
|
||||
#include "caffe2/mobile/contrib/opengl/android/arm_neon_support.h"
|
||||
#endif
|
||||
#endif
|
@ -1,367 +0,0 @@
|
||||
|
||||
#include "rewrite_net.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
#ifdef CAFFE2_ANDROID
|
||||
#include "../android/AndroidGLContext.h"
|
||||
#endif
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
struct Analysis {
|
||||
struct SSA {
|
||||
using BlobVersions = std::unordered_map<std::string, size_t>;
|
||||
BlobVersions inVersions;
|
||||
BlobVersions outVersions;
|
||||
};
|
||||
std::vector<SSA> ssa;
|
||||
std::unordered_map<std::string, std::unordered_map<size_t, std::vector<size_t>>> inUsages;
|
||||
};
|
||||
|
||||
static Analysis analyzeNet(const NetDef& net) {
|
||||
Analysis::SSA::BlobVersions frontier;
|
||||
Analysis analysis;
|
||||
|
||||
auto play = [&](size_t i, const OperatorDef& op) {
|
||||
Analysis::SSA::BlobVersions inVersions;
|
||||
for (const auto& s : op.input()) {
|
||||
inVersions[s] = frontier[s];
|
||||
analysis.inUsages[s][frontier[s]].push_back(i);
|
||||
}
|
||||
Analysis::SSA::BlobVersions outVersions;
|
||||
for (const auto& s : op.output()) {
|
||||
if (frontier.find(s) != frontier.end()) {
|
||||
frontier[s] += 1;
|
||||
}
|
||||
outVersions[s] = frontier[s];
|
||||
}
|
||||
analysis.ssa.push_back(Analysis::SSA{inVersions, outVersions});
|
||||
};
|
||||
|
||||
for (auto i = 0; i < net.op_size(); ++i) {
|
||||
play(i, net.op(i));
|
||||
}
|
||||
return analysis;
|
||||
}
|
||||
|
||||
static void insertCopyToGPUOp(NetDef& predictNet, const std::string& cpu_blob) {
|
||||
auto* op = predictNet.add_op();
|
||||
op->set_name("CopyToOpenGL");
|
||||
op->set_type("CopyToOpenGL");
|
||||
op->add_input(cpu_blob);
|
||||
op->add_output(cpu_blob + "_M");
|
||||
}
|
||||
|
||||
static void insertCopyFromGPUOp(NetDef& predictNet, const std::string& cpu_blob) {
|
||||
// add argument "is_last" to the last op to signal this is the last operator before the
|
||||
// CopyFromOpenGL op
|
||||
auto* last_op = predictNet.mutable_op(predictNet.op_size() - 1);
|
||||
auto* arg = last_op->add_arg();
|
||||
arg->set_name("is_last");
|
||||
arg->set_i(1);
|
||||
|
||||
auto* op = predictNet.add_op();
|
||||
op->set_name("CopyFromOpenGL");
|
||||
op->set_type("CopyFromOpenGL");
|
||||
op->add_input(cpu_blob + "_M");
|
||||
op->add_output(cpu_blob);
|
||||
}
|
||||
|
||||
static NetDef insertInputOutputCopyOps(const NetDef& def, std::unordered_set<std::string>& glOps) {
|
||||
// Do some validation of the outputs. For this version, we require:
|
||||
// - a single input (first element of external_input()) is consumed by the NetDef
|
||||
// - a single output (first element of external_output()) is produced by the NetDef.
|
||||
// - the input is consumed by def.op(0), and this is the only consumer.
|
||||
// - the output is produced by def.op(-1).
|
||||
CAFFE_ENFORCE_GE(def.external_input_size(), 1);
|
||||
CAFFE_ENFORCE_GE(def.external_output_size(), 1);
|
||||
auto analysis = analyzeNet(def);
|
||||
// enforce a single use of the input blob.
|
||||
CAFFE_ENFORCE_GE(def.op_size(), 1);
|
||||
|
||||
const auto& inputBlob = def.external_input(0);
|
||||
// Enforce that the input blob has a single usage - in the first operator.
|
||||
CAFFE_ENFORCE(analysis.inUsages[inputBlob][0] == (std::vector<size_t>{0}));
|
||||
// Enforce that the external_output(0) blob is produced by the last operator in this sequence.
|
||||
const auto& outputBlob = def.external_output(0);
|
||||
CAFFE_ENFORCE(analysis.ssa.back().outVersions.find(outputBlob) !=
|
||||
analysis.ssa.back().outVersions.end());
|
||||
const auto& outputBlobVersion = analysis.ssa.back().outVersions[outputBlob];
|
||||
// This should hold true by definition of the SSA analysis.
|
||||
CAFFE_ENFORCE(analysis.inUsages[outputBlob].find(outputBlobVersion) ==
|
||||
analysis.inUsages[outputBlob].end());
|
||||
|
||||
NetDef mdef;
|
||||
mdef.CopyFrom(def);
|
||||
mdef.clear_op();
|
||||
|
||||
std::unordered_map<std::string, std::set<size_t>> cpu_blobs, gpu_blobs;
|
||||
cpu_blobs[def.external_input(0)].insert(0);
|
||||
|
||||
for (auto i = 0; i < def.op_size(); i++) {
|
||||
const auto& currentOp = def.op(i);
|
||||
if (glOps.count(currentOp.type()) > 0) {
|
||||
// OpenGL Op
|
||||
// insert copyToOpenGLOp
|
||||
for (auto j = 0; j < currentOp.input_size(); j++) {
|
||||
auto& input = currentOp.input(j);
|
||||
auto version = analysis.ssa[i].inVersions[input];
|
||||
if (cpu_blobs[input].count(version) > 0) {
|
||||
insertCopyToGPUOp(mdef, input);
|
||||
gpu_blobs[input].insert(version);
|
||||
cpu_blobs[input].erase(version);
|
||||
}
|
||||
// Only the first input should be OpenGL texture
|
||||
// Otherwise, copyToOpenGLOp will be inserted for the weights,
|
||||
// which are outputs of QuantDecode
|
||||
if (currentOp.type().find("OpenGLConv") == 0) {
|
||||
if (j == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto* op = mdef.add_op();
|
||||
op->CopyFrom(currentOp);
|
||||
|
||||
// swap input blob
|
||||
for (auto j = 0; j < currentOp.input_size(); j++) {
|
||||
auto& input = currentOp.input(j);
|
||||
auto version = analysis.ssa[i].inVersions[input];
|
||||
if (gpu_blobs[input].count(version) > 0) {
|
||||
op->set_input(j, input + "_M");
|
||||
}
|
||||
}
|
||||
|
||||
// swap output blob
|
||||
for (auto j = 0; j < currentOp.output_size(); j++) {
|
||||
auto& output = currentOp.output(j);
|
||||
auto version = analysis.ssa[i].outVersions[output];
|
||||
op->set_output(j, output + "_M");
|
||||
gpu_blobs[output].insert(version);
|
||||
}
|
||||
// insert copyFromOpenGLOp after the last op if the last op is an OpenGL op
|
||||
if (i == def.op_size() - 1) {
|
||||
insertCopyFromGPUOp(mdef, currentOp.output(0));
|
||||
}
|
||||
} else {
|
||||
// CPU Op
|
||||
// insert copyFromOpenGLOp
|
||||
for (auto j = 0; j < currentOp.input_size(); j++) {
|
||||
auto& input = currentOp.input(j);
|
||||
auto version = analysis.ssa[i].inVersions[input];
|
||||
if (gpu_blobs[input].count(version) > 0) {
|
||||
insertCopyFromGPUOp(mdef, input);
|
||||
}
|
||||
}
|
||||
auto* op = mdef.add_op();
|
||||
op->CopyFrom(currentOp);
|
||||
for (auto j = 0; j < currentOp.output_size(); j++) {
|
||||
auto& output = currentOp.output(j);
|
||||
auto version = analysis.ssa[i].outVersions[output];
|
||||
cpu_blobs[output].insert(version);
|
||||
}
|
||||
}
|
||||
}
|
||||
return mdef;
|
||||
}
|
||||
|
||||
static bool tryFuseAdjacentOps(const OperatorDef& currentOp,
|
||||
const OperatorDef& nextOp,
|
||||
OperatorDef* fusedOp,
|
||||
std::unordered_set<std::string>& glOps) {
|
||||
// Check for possible invalid opportunities.
|
||||
if (currentOp.output_size() != 1 || nextOp.output_size() != 1) {
|
||||
return false;
|
||||
}
|
||||
// The fused op cannot be inplace
|
||||
if (currentOp.output(0) != nextOp.input(0) || currentOp.input(0) == nextOp.output(0)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
static const std::map<std::pair<std::string, std::string>, std::string> fusionOpportunities = {
|
||||
{{"OpenGLInstanceNorm", "OpenGLPRelu"}, "OpenGLInstanceNormPRelu"},
|
||||
{{"OpenGLConv", "OpenGLPRelu"}, "OpenGLConvPRelu"},
|
||||
{{"OpenGLConv", "OpenGLRelu"}, "OpenGLConvRelu"},
|
||||
{{"OpenGLConvTranspose", "OpenGLPRelu"}, "OpenGLConvTransposePRelu"}};
|
||||
auto it = fusionOpportunities.find({currentOp.type(), nextOp.type()});
|
||||
if (it == fusionOpportunities.end()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
glOps.insert(it->second);
|
||||
fusedOp->CopyFrom(currentOp);
|
||||
fusedOp->set_output(0, nextOp.output(0));
|
||||
fusedOp->set_type(it->second);
|
||||
for (auto i = 1; i < nextOp.input_size(); i++) {
|
||||
fusedOp->add_input(nextOp.input(i));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static NetDef runOpenGLFusion(const NetDef& def, std::unordered_set<std::string>& glOps) {
|
||||
CHECK_GE(def.op_size(), 1);
|
||||
NetDef mdef;
|
||||
mdef.CopyFrom(def);
|
||||
mdef.clear_op();
|
||||
auto i = 0;
|
||||
|
||||
while (i < def.op_size()) {
|
||||
if (i == def.op_size() - 1) {
|
||||
VLOG(2) << "Last operator, skipping";
|
||||
auto* op = mdef.add_op();
|
||||
op->CopyFrom(def.op(i));
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
const auto& currentOp = def.op(i);
|
||||
const auto& nextOp = def.op(i + 1);
|
||||
OperatorDef fusedOp;
|
||||
if (tryFuseAdjacentOps(currentOp, nextOp, &fusedOp, glOps)) {
|
||||
VLOG(2) << "Found an adjacent fusion for: " << currentOp.type() << ", " << nextOp.type();
|
||||
// We can fuse.
|
||||
auto* op = mdef.add_op();
|
||||
op->CopyFrom(fusedOp);
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
VLOG(2) << "No fusion available for: " << currentOp.type() << ", " << nextOp.type();
|
||||
// Just emit the current type.
|
||||
auto* op = mdef.add_op();
|
||||
op->CopyFrom(currentOp);
|
||||
i += 1;
|
||||
}
|
||||
return mdef;
|
||||
}
|
||||
|
||||
void dumpDefForOpenGL(const NetDef& d) {
|
||||
for (const auto& op : d.op()) {
|
||||
LOG(INFO) << op.input(0) << " -> " << op.type() << " -> " << op.output(0);
|
||||
}
|
||||
}
|
||||
|
||||
// // For debugging
|
||||
// void dumpDefForOpenGL(const NetDef &net) {
|
||||
// for (const auto &op : net.op()) {
|
||||
// printf("***Operator: %s\n", op.type().c_str());
|
||||
// for (auto input : op.input()) {
|
||||
// printf("\tInput: %s\n", input.c_str());
|
||||
// }
|
||||
//
|
||||
// for (auto output : op.output()) {
|
||||
// printf("\tOutput: %s\n", output.c_str());
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
|
||||
NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput, bool useTiling, bool runFusion) {
|
||||
CAFFE_ENFORCE_GE(predictNet.op_size(), 1);
|
||||
NetDef net;
|
||||
net.CopyFrom(predictNet);
|
||||
|
||||
std::unordered_map<std::string, std::string> replacements(
|
||||
{{"OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess",
|
||||
useTextureInput ? "OpenGLTextureToTextureStylizerPreprocess"
|
||||
: "OpenGLTensorToTextureStylizerPreprocess"},
|
||||
{"OpenGLBRGNCHWCToPackedInt8BGRAStylizerDeprocess",
|
||||
useTextureInput ? "OpenGLTextureToTextureStylizerDeprocess"
|
||||
: "OpenGLTextureToTensorStylizerDeprocess"}});
|
||||
|
||||
std::unordered_set<std::string> openGLOps; // Used to insert copy ops
|
||||
bool needCopyOps = false;
|
||||
|
||||
const auto& opKeyList = CPUOperatorRegistry()->Keys();
|
||||
auto opKeySet = std::set<std::string>(opKeyList.begin(), opKeyList.end());
|
||||
|
||||
#ifdef CAFFE2_ANDROID
|
||||
// TODO: debug InstanceNorm models on Mali devices
|
||||
AndroidGLContext* context = (AndroidGLContext*)GLContext::getGLContext();
|
||||
if (context->get_platform() == Mali) {
|
||||
opKeySet.erase("OpenGLInstanceNorm");
|
||||
opKeySet.erase("OpenGLInstanceNormPRelu");
|
||||
}
|
||||
#endif
|
||||
for (auto i = 0; i < net.op_size(); ++i) {
|
||||
auto* op = net.mutable_op(i);
|
||||
string openGLOp = std::string("OpenGL") + op->type();
|
||||
if (replacements.count(openGLOp) > 0) {
|
||||
openGLOp = replacements[openGLOp];
|
||||
}
|
||||
|
||||
if (opKeySet.find(openGLOp) != opKeySet.end()) {
|
||||
op->set_type(openGLOp);
|
||||
openGLOps.insert(openGLOp);
|
||||
|
||||
if (useTiling) {
|
||||
auto* arg = op->add_arg();
|
||||
arg->set_name("tiling");
|
||||
arg->set_i(1);
|
||||
}
|
||||
} else {
|
||||
needCopyOps = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (useTextureInput && needCopyOps) {
|
||||
CAFFE_THROW("OpenGL operator missing");
|
||||
}
|
||||
|
||||
if (runFusion) {
|
||||
net = runOpenGLFusion(net, openGLOps);
|
||||
}
|
||||
|
||||
if (net.op(0).type() == replacements["OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess"]) {
|
||||
// For end-to-end testing
|
||||
if (net.op(net.op_size() - 1).type() !=
|
||||
replacements["OpenGLBRGNCHWCToPackedInt8BGRAStylizerDeprocess"]) {
|
||||
auto* last_op = net.mutable_op(net.op_size() - 1);
|
||||
auto output = last_op->output(0) + "M";
|
||||
last_op->set_output(0, output);
|
||||
auto* copy_op = net.add_op();
|
||||
copy_op->set_name("CopyFromOpenGL");
|
||||
copy_op->set_type("CopyFromOpenGL");
|
||||
copy_op->add_input(output);
|
||||
// rename output blob in case input and output blob has the same name
|
||||
copy_op->add_output(net.external_output(0));
|
||||
}
|
||||
} else {
|
||||
if (!useTextureInput) {
|
||||
needCopyOps = true;
|
||||
}
|
||||
}
|
||||
|
||||
// copy ops are needed when the input is not a texture
|
||||
if (needCopyOps) {
|
||||
// For non style transfer cases
|
||||
net = insertInputOutputCopyOps(net, openGLOps);
|
||||
}
|
||||
|
||||
return net;
|
||||
}
|
||||
|
||||
bool tryConvertToOpenGL(const NetDef& initNet,
|
||||
const NetDef& predictNet,
|
||||
NetDef* glPredictNet,
|
||||
bool useTextureInput,
|
||||
bool useTiling,
|
||||
bool runFusion) {
|
||||
try {
|
||||
// Throws if unsupported operators are found.
|
||||
*glPredictNet = rewritePredictNetForOpenGL(predictNet, useTextureInput, useTiling, runFusion);
|
||||
dumpDefForOpenGL(*glPredictNet);
|
||||
// Throws if unsupported parameters are found.
|
||||
Workspace ws;
|
||||
ws.RunNetOnce(initNet);
|
||||
ws.CreateNet(*glPredictNet);
|
||||
LOG(INFO) << "OpenGL is successfully enabled";
|
||||
return true;
|
||||
} catch (const std::exception& e) {
|
||||
LOG(ERROR) << "Caught exception trying to convert NetDef to OpenGL: " << e.what();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} // namespace caffe2
|
@ -1,20 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
#include "GLPredictor.h"
|
||||
#include "caffe2/predictor/predictor.h"
|
||||
|
||||
namespace caffe2 {
|
||||
bool tryConvertToOpenGL(const NetDef& initNet,
|
||||
const NetDef& predictNet,
|
||||
NetDef* glPredictNet,
|
||||
bool useTextureInput = false,
|
||||
bool useTiling = false,
|
||||
bool runFusion = true);
|
||||
|
||||
// Exposed for testing
|
||||
NetDef rewritePredictNetForOpenGL(const NetDef& predictNet,
|
||||
bool useTextureInput = false,
|
||||
bool useTiling = false,
|
||||
bool runFusion = true);
|
||||
void dumpDefForOpenGL(const NetDef& net);
|
||||
} // namespace caffe2
|
@ -1,2 +0,0 @@
|
||||
file(GLOB_RECURSE tmp *.mm *.cc)
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
|
@ -1,19 +0,0 @@
|
||||
|
||||
#include "IOSGLContext.h"
|
||||
|
||||
std::unique_ptr<GLContext> GLContext::_glcontext = nullptr;
|
||||
|
||||
void GLContext::initGLContext() {
|
||||
if (_glcontext == nullptr) {
|
||||
_glcontext.reset(new IOSGLContext());
|
||||
}
|
||||
}
|
||||
|
||||
GLContext* GLContext::getGLContext() {
|
||||
if (_glcontext == nullptr) {
|
||||
initGLContext();
|
||||
}
|
||||
return _glcontext.get();
|
||||
}
|
||||
|
||||
void GLContext::deleteGLContext() { _glcontext.reset(nullptr); }
|
@ -1,11 +0,0 @@
|
||||
|
||||
#include "IOSGLImageAllocator.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
template <typename T>
|
||||
GLImageAllocator<T>* GLImageAllocator<T>::newGLImageAllocator() {
|
||||
return new IOSGLImageAllocator<T>();
|
||||
}
|
||||
|
||||
template GLImageAllocator<float16_t>* GLImageAllocator<float16_t>::newGLImageAllocator();
|
||||
template GLImageAllocator<uint8_t>* GLImageAllocator<uint8_t>::newGLImageAllocator();
|
@ -1,22 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "../core/GLContext.h"
|
||||
#include "../core/GLTexture.h"
|
||||
|
||||
#import <CoreVideo/CoreVideo.h>
|
||||
|
||||
class IOSGLContext : public GLContext {
|
||||
void* oglContext;
|
||||
void* oldContext;
|
||||
CVOpenGLESTextureCacheRef textureCache;
|
||||
|
||||
public:
|
||||
IOSGLContext();
|
||||
~IOSGLContext();
|
||||
|
||||
const GLTexture* createNewTexture(CVPixelBufferRef pixelBuffer, const GLTexture::Type& type);
|
||||
void set_context();
|
||||
void reset_context();
|
||||
void flush_context();
|
||||
};
|
@ -1,98 +0,0 @@
|
||||
|
||||
#include "IOSGLContext.h"
|
||||
#include "IOSGLTexture.h"
|
||||
#import <sstream>
|
||||
|
||||
#import <OpenGLES/EAGL.h>
|
||||
|
||||
IOSGLContext::IOSGLContext() {
|
||||
auto const currentContext = [EAGLContext currentContext];
|
||||
oldContext = (void*)CFBridgingRetain(currentContext);
|
||||
|
||||
if (currentContext != nil && [currentContext API] == kEAGLRenderingAPIOpenGLES3) {
|
||||
oglContext = (void*)CFBridgingRetain(currentContext);
|
||||
|
||||
gl_log(GL_LOG, "Reusing current context %p\n", oglContext);
|
||||
} else {
|
||||
oglContext =
|
||||
(void*)CFBridgingRetain([[EAGLContext alloc] initWithAPI:kEAGLRenderingAPIOpenGLES3]);
|
||||
|
||||
gl_log(GL_LOG, "Created a new context %p\n", oglContext);
|
||||
}
|
||||
|
||||
if (!oglContext) {
|
||||
throw std::runtime_error("Problem with OpenGL context");
|
||||
}
|
||||
|
||||
set_context();
|
||||
textureCache = NULL;
|
||||
CVReturn err = CVOpenGLESTextureCacheCreate(
|
||||
kCFAllocatorDefault, NULL, (__bridge EAGLContext*)oglContext, NULL, &textureCache);
|
||||
|
||||
if (err) {
|
||||
std::stringstream errmsg;
|
||||
errmsg << "Error at CVOpenGLESTextureCacheCreate " << err;
|
||||
throw std::runtime_error(errmsg.str());
|
||||
}
|
||||
}
|
||||
|
||||
IOSGLContext::~IOSGLContext() {
|
||||
gl_log(GL_VERBOSE, "~IOSGLContext()");
|
||||
|
||||
set_context();
|
||||
if (textureCache) {
|
||||
CFRelease(textureCache);
|
||||
textureCache = 0;
|
||||
}
|
||||
reset_context();
|
||||
|
||||
// Explicitly release only after we `reset_context` since otherwise we are going to read from a
|
||||
// dangling pointer.
|
||||
if (oglContext) {
|
||||
CFBridgingRelease(oglContext);
|
||||
}
|
||||
if (oldContext) {
|
||||
CFBridgingRelease(oldContext);
|
||||
}
|
||||
}
|
||||
|
||||
const GLTexture* IOSGLContext::createNewTexture(CVPixelBufferRef pixelBuffer,
|
||||
const GLTexture::Type& type) {
|
||||
return new IOSGLTexture(type, textureCache, pixelBuffer);
|
||||
}
|
||||
|
||||
void IOSGLContext::set_context() {
|
||||
auto const currentContext = [EAGLContext currentContext];
|
||||
|
||||
if ((__bridge void*)currentContext != oglContext) {
|
||||
if (![EAGLContext setCurrentContext:(__bridge EAGLContext*)oglContext]) {
|
||||
throw std::runtime_error("Problem setting OpenGL context");
|
||||
}
|
||||
GLenum glError = glGetError();
|
||||
if (glError != GL_NO_ERROR) {
|
||||
gl_log(GL_ERR, "There is an error: 0x%X\n", glError);
|
||||
}
|
||||
gl_log(GL_VERBOSE, "Set context to %p\n", oglContext);
|
||||
}
|
||||
}
|
||||
|
||||
void IOSGLContext::reset_context() {
|
||||
EAGLContext* currentContext = [EAGLContext currentContext];
|
||||
|
||||
if ((__bridge void*)currentContext != oldContext) {
|
||||
GLenum glError = glGetError();
|
||||
if (glError != GL_NO_ERROR) {
|
||||
gl_log(GL_ERR, "There is an error before: 0x%X\n", glError);
|
||||
}
|
||||
if (![EAGLContext setCurrentContext:(__bridge EAGLContext*)oldContext]) {
|
||||
throw std::runtime_error("Problem setting OpenGL context");
|
||||
}
|
||||
glError = glGetError();
|
||||
if (glError != GL_NO_ERROR) {
|
||||
gl_log(GL_ERR, "There is an error after: 0x%X\n", glError);
|
||||
}
|
||||
gl_log(GL_VERBOSE, "Reset context to %p\n", oldContext);
|
||||
}
|
||||
}
|
||||
|
||||
void IOSGLContext::flush_context() { CVOpenGLESTextureCacheFlush(textureCache, 0); }
|
@ -1,78 +0,0 @@
|
||||
|
||||
#include "IOSGLImageAllocator.h"
|
||||
|
||||
#include "../core/GLImage.h"
|
||||
#include "../core/GLImageAllocator.h"
|
||||
#include "../core/GLPlainTexture.h"
|
||||
|
||||
#include "IOSGLContext.h"
|
||||
#include "IOSGLTexture.h"
|
||||
|
||||
#include "../core/arm_neon_support.h"
|
||||
|
||||
template <class T>
|
||||
GLImageVector<T>* IOSGLImageAllocator<T>::newImage(int num_images,
|
||||
int width,
|
||||
int height,
|
||||
int channels,
|
||||
int tile_x,
|
||||
int tile_y,
|
||||
bool useCVPixelBuffer) {
|
||||
GLImageVector<T>* output_images =
|
||||
new GLImageVector<T>(num_images, width, height, channels, tile_x, tile_y);
|
||||
if (useCVPixelBuffer) {
|
||||
IOSGLContext* gl_context = (IOSGLContext*)GLContext::getGLContext();
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
GLImage<T>* output_image = new GLImage<T>(
|
||||
width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
|
||||
gl_log(GL_VERBOSE,
|
||||
"%s pixelbuffers.size(): %ld\n",
|
||||
__PRETTY_FUNCTION__,
|
||||
pixelbuffers.size());
|
||||
|
||||
CVPixelBufferRef buffer = NULL;
|
||||
int slices = (channels + 3) / 4;
|
||||
int slice_index = i * slices + slice;
|
||||
if (pixelbuffers.size() < slice_index + 1) {
|
||||
const int texture_width = width * tile_x;
|
||||
const int texture_height = height * tile_y;
|
||||
buffer =
|
||||
IOSGLTexture::createCVPixelBuffer(pixelFormat, texture_width, texture_height);
|
||||
gl_log(GL_VERBOSE,
|
||||
"created a new buffer %p for image %d slice %d of dimensions %dx%d\n",
|
||||
buffer,
|
||||
i,
|
||||
slice,
|
||||
texture_width,
|
||||
texture_height);
|
||||
pixelbuffers.push_back(buffer);
|
||||
} else {
|
||||
buffer = pixelbuffers[slice_index];
|
||||
|
||||
gl_log(GL_VERBOSE, "reused buffer %p for image %d slice %d\n", buffer, i, slice);
|
||||
}
|
||||
|
||||
return gl_context->createNewTexture(buffer, GLImageAllocator<T>::type);
|
||||
});
|
||||
output_images->push_back(output_image);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
GLImage<T>* image = new GLImage<T>(
|
||||
width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
|
||||
return new GLPlainTexture(
|
||||
GLImageAllocator<T>::type, nullptr, width * tile_x, height * tile_y);
|
||||
});
|
||||
output_images->push_back(image);
|
||||
}
|
||||
}
|
||||
return output_images;
|
||||
}
|
||||
|
||||
template <>
|
||||
const FourCharCode IOSGLImageAllocator<float16_t>::pixelFormat = kCVPixelFormatType_64RGBAHalf;
|
||||
template <>
|
||||
const FourCharCode IOSGLImageAllocator<uint8_t>::pixelFormat = kCVPixelFormatType_32BGRA;
|
||||
|
||||
template class IOSGLImageAllocator<float16_t>;
|
||||
template class IOSGLImageAllocator<uint8_t>;
|
@ -1,34 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "../core/GLImageAllocator.h"
|
||||
|
||||
#import <CoreVideo/CoreVideo.h>
|
||||
|
||||
template <class T>
|
||||
class IOSGLImageAllocator : public GLImageAllocator<T> {
|
||||
static const GLTexture::Type& type;
|
||||
|
||||
std::vector<CVPixelBufferRef> pixelbuffers;
|
||||
|
||||
public:
|
||||
static const FourCharCode pixelFormat;
|
||||
|
||||
IOSGLImageAllocator() : GLImageAllocator<T>() { gl_log(GL_VERBOSE, "%s\n", __PRETTY_FUNCTION__); }
|
||||
|
||||
~IOSGLImageAllocator() {
|
||||
gl_log(GL_VERBOSE, "%s\n", __PRETTY_FUNCTION__);
|
||||
|
||||
for (auto&& pixelbuffer : pixelbuffers) {
|
||||
CFRelease(pixelbuffer);
|
||||
}
|
||||
}
|
||||
|
||||
GLImageVector<T>* newImage(int num_images,
|
||||
int width,
|
||||
int height,
|
||||
int channels,
|
||||
int tile_x,
|
||||
int tile_y,
|
||||
bool useCVPixelBuffer);
|
||||
};
|
@ -1,51 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "../core/GLContext.h"
|
||||
#include "../core/GLTexture.h"
|
||||
|
||||
#import <CoreVideo/CoreVideo.h>
|
||||
|
||||
class IOSGLTexture : public GLTexture {
|
||||
CVOpenGLESTextureRef textureRef;
|
||||
|
||||
IOSGLTexture(const Type& type,
|
||||
CVOpenGLESTextureCacheRef textureCache,
|
||||
CVPixelBufferRef sourceImage,
|
||||
GLint _filter = GL_NEAREST,
|
||||
GLint _wrap = GL_CLAMP_TO_EDGE);
|
||||
|
||||
friend class IOSGLContext;
|
||||
|
||||
public:
|
||||
const CVPixelBufferRef sourceImage;
|
||||
|
||||
~IOSGLTexture() { CFRelease(textureRef); }
|
||||
|
||||
void map_buffer(std::function<void(void* buffer,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t stride,
|
||||
size_t channels,
|
||||
const Type& type)> process) const;
|
||||
|
||||
virtual void map_read(std::function<void(const void* buffer,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t stride,
|
||||
size_t channels,
|
||||
const Type& type)> process) const;
|
||||
|
||||
virtual void map_load(std::function<void(void* buffer,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t stride,
|
||||
size_t channels,
|
||||
const Type& type)> process) const;
|
||||
|
||||
GLuint name() const { return CVOpenGLESTextureGetName(textureRef); }
|
||||
GLenum target() const { return CVOpenGLESTextureGetTarget(textureRef); };
|
||||
bool flipped() const { return CVOpenGLESTextureIsFlipped(textureRef); };
|
||||
|
||||
static CVPixelBufferRef createCVPixelBuffer(OSType pixelType, int32_t width, int32_t height);
|
||||
};
|
@ -1,121 +0,0 @@
|
||||
|
||||
#include "IOSGLTexture.h"
|
||||
#include "../core/DataTransfer.h"
|
||||
|
||||
IOSGLTexture::IOSGLTexture(const Type& type,
|
||||
CVOpenGLESTextureCacheRef textureCache,
|
||||
CVPixelBufferRef _sourceImage,
|
||||
GLint filter,
|
||||
GLint wrap)
|
||||
: GLTexture(type,
|
||||
CVPixelBufferGetWidth(_sourceImage),
|
||||
CVPixelBufferGetHeight(_sourceImage),
|
||||
CVPixelBufferGetBytesPerRow(_sourceImage) / (type.channels() * type.dataSize()),
|
||||
false,
|
||||
filter,
|
||||
wrap),
|
||||
sourceImage(_sourceImage) {
|
||||
CVReturn err = CVOpenGLESTextureCacheCreateTextureFromImage(kCFAllocatorDefault,
|
||||
textureCache,
|
||||
_sourceImage,
|
||||
NULL,
|
||||
GL_TEXTURE_2D,
|
||||
_type.internalFormat,
|
||||
_width,
|
||||
_height,
|
||||
_type.format,
|
||||
_type.type,
|
||||
0,
|
||||
&textureRef);
|
||||
|
||||
if (!textureRef || err) {
|
||||
gl_log(GL_ERR,
|
||||
"something went wrong, sourceImage: %p, width: %d, height: %d, filter: %d, wrap: %d\n",
|
||||
_sourceImage,
|
||||
_width,
|
||||
_height,
|
||||
filter,
|
||||
wrap);
|
||||
}
|
||||
_textureId = name();
|
||||
gl_log(
|
||||
GL_VERBOSE,
|
||||
"IOSGLTexture() - allocated textureId %d, internalFormat: 0x%X, format: 0x%X, type: 0x%X\n",
|
||||
_textureId,
|
||||
_type.internalFormat,
|
||||
_type.format,
|
||||
_type.type);
|
||||
|
||||
glActiveTexture(GL_TEXTURE0);
|
||||
glBindTexture(GL_TEXTURE_2D, _textureId);
|
||||
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, filter);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, filter);
|
||||
|
||||
#if GL_EXT_texture_border_clamp
|
||||
GLfloat borderColor[] = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR_EXT, borderColor);
|
||||
// Set the texture to use the border clamp wrapping mode.
|
||||
wrap = GL_CLAMP_TO_BORDER_EXT;
|
||||
#endif
|
||||
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, wrap);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, wrap);
|
||||
|
||||
glBindTexture(GL_TEXTURE_2D, 0);
|
||||
}
|
||||
|
||||
CVPixelBufferRef IOSGLTexture::createCVPixelBuffer(OSType pixelFormat,
|
||||
int32_t width,
|
||||
int32_t height) {
|
||||
NSDictionary* pixelBufferAttributes = @{
|
||||
(id)kCVPixelBufferPixelFormatTypeKey : @(pixelFormat),
|
||||
(id)kCVPixelFormatOpenGLESCompatibility : @YES,
|
||||
(id)kCVPixelBufferIOSurfacePropertiesKey : @{/*empty dictionary*/}
|
||||
};
|
||||
|
||||
CVPixelBufferRef buffer = NULL;
|
||||
CVPixelBufferCreate(kCFAllocatorDefault,
|
||||
width,
|
||||
height,
|
||||
pixelFormat,
|
||||
(__bridge CFDictionaryRef)(pixelBufferAttributes),
|
||||
&buffer);
|
||||
return buffer;
|
||||
}
|
||||
|
||||
void IOSGLTexture::map_buffer(std::function<void(void* buffer,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t stride,
|
||||
size_t channels,
|
||||
const Type& type)> process) const {
|
||||
if (CVPixelBufferLockBaseAddress(sourceImage, 0) == kCVReturnSuccess) {
|
||||
void* buffer = CVPixelBufferGetBaseAddress(sourceImage);
|
||||
int buffer_stride = CVPixelBufferGetBytesPerRow(sourceImage) / (_channels * _type.dataSize());
|
||||
process(buffer, _width, _height, buffer_stride, _channels, _type);
|
||||
|
||||
CVPixelBufferUnlockBaseAddress(sourceImage, 0);
|
||||
}
|
||||
}
|
||||
|
||||
void IOSGLTexture::map_load(std::function<void(void* buffer,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t stride,
|
||||
size_t channels,
|
||||
const Type& type)> process) const {
|
||||
map_buffer(process);
|
||||
}
|
||||
|
||||
void IOSGLTexture::map_read(std::function<void(const void* buffer,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t stride,
|
||||
size_t channels,
|
||||
const Type& type)> process) const {
|
||||
// TODO: why is glFlush() only necessary when running tests
|
||||
glFlush();
|
||||
|
||||
map_buffer(process);
|
||||
}
|
@ -1,2 +0,0 @@
|
||||
file(GLOB_RECURSE tmp *.cc)
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
|
@ -1,143 +0,0 @@
|
||||
|
||||
#include "../core/GLFilter.h"
|
||||
#include "../core/GLImage.h"
|
||||
#include "../core/ImageAllocator.h"
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
class GLAdd : public GLFilter {
|
||||
public:
|
||||
binding* inputData[2];
|
||||
binding* outputSize;
|
||||
|
||||
GLAdd()
|
||||
: GLFilter("GLAdd",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>(
|
||||
{BINDING(outputSize), BINDING(inputData[0]), BINDING(inputData[1])}),
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{/* no replacements */}) {}
|
||||
|
||||
template <typename T>
|
||||
void add(const GLImageVector<T>& input_image0,
|
||||
const GLImageVector<T>& input_image1,
|
||||
const GLImageVector<T>& output_image);
|
||||
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
// MARK: GLSL
|
||||
|
||||
const char* GLAdd::fragment_shader = R"GLSL(#version 300 es
|
||||
|
||||
precision mediump float;
|
||||
precision mediump int;
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
|
||||
uniform ivec2 outputSize;
|
||||
|
||||
TEXTURE_INPUT(inputData[2]);
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
void main() {
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
|
||||
vec4 A = TEXTURE_LOAD(inputData[0], texelCoord);
|
||||
vec4 B = TEXTURE_LOAD(inputData[1], texelCoord);
|
||||
vec4 value = A + B;
|
||||
outputData = TEXTURE_STORE(value);
|
||||
}
|
||||
|
||||
)GLSL";
|
||||
|
||||
template <typename T>
|
||||
void GLAdd::add(const GLImageVector<T>& input_images0,
|
||||
const GLImageVector<T>& input_images1,
|
||||
const GLImageVector<T>& output_images) {
|
||||
const int num_images = input_images0.size();
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
GLImage<T>* input_image0 = input_images0[i];
|
||||
GLImage<T>* input_image1 = input_images1[i];
|
||||
int input_slices = input_image0->slices;
|
||||
GLImage<T>* output_image = output_images[i];
|
||||
int output_slices = output_image->slices;
|
||||
|
||||
for (int is = 0; is < input_slices; is++) {
|
||||
std::vector<texture_attachment> input_attachments;
|
||||
input_attachments.push_back({input_image0->textures[is], inputData[0]});
|
||||
input_attachments.push_back({input_image1->textures[is], inputData[1]});
|
||||
|
||||
run(input_attachments,
|
||||
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
|
||||
[&]() { glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height); },
|
||||
output_image->texture_width,
|
||||
output_image->texture_height);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace caffe2 {
|
||||
template <typename T>
|
||||
class OpenGLAddOp final : public Operator<CPUContext>, ImageAllocator<T> {
|
||||
public:
|
||||
OpenGLAddOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws) {
|
||||
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("broadcast") == false,
|
||||
"OpenGLAdd does not support broadcast");
|
||||
|
||||
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false, "OpenGLAdd does not support axis");
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const GLImageVector<T>& input0 = Inputs()[0]->template Get<GLImageVector<T>>();
|
||||
const GLImageVector<T>& input1 = Inputs()[1]->template Get<GLImageVector<T>>();
|
||||
|
||||
CAFFE_ENFORCE_EQ(input0.size(), input1.size());
|
||||
|
||||
const int num_images = input0.size();
|
||||
const int input_channels = input0.channels();
|
||||
const int input_width = input0.width();
|
||||
const int input_height = input0.height();
|
||||
const int input_tile_x = input0.tile_x();
|
||||
const int input_tile_y = input0.tile_y();
|
||||
|
||||
CAFFE_ENFORCE_EQ(input1.channels(), input_channels);
|
||||
CAFFE_ENFORCE_EQ(input1.width(), input_width);
|
||||
CAFFE_ENFORCE_EQ(input1.height(), input_height);
|
||||
CAFFE_ENFORCE_EQ(input1.tile_x(), input_tile_x);
|
||||
CAFFE_ENFORCE_EQ(input1.tile_y(), input_tile_y);
|
||||
|
||||
const int output_channels = input_channels;
|
||||
const int output_width = input_width;
|
||||
const int output_height = input_height;
|
||||
const int output_tile_x = input_tile_x;
|
||||
const int output_tile_y = input_tile_y;
|
||||
|
||||
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
|
||||
|
||||
GLImageVector<T>* output = ImageAllocator<T>::newImage(
|
||||
num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
|
||||
|
||||
if (!_add) {
|
||||
_add.reset(new GLAdd());
|
||||
}
|
||||
|
||||
_add->add(input0, input1, *output);
|
||||
|
||||
Outputs()[0]->Reset(output);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
std::unique_ptr<GLAdd> _add;
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(OpenGLAdd, OpenGLAddOp<float16_t>);
|
||||
OPERATOR_SCHEMA(OpenGLAdd).NumInputs(2).NumOutputs(1);
|
||||
} // namespace caffe2
|
@ -1,202 +0,0 @@
|
||||
|
||||
#include "../core/GLFilter.h"
|
||||
#include "../core/GLImage.h"
|
||||
#include "../core/ImageAllocator.h"
|
||||
#include "gl_tiling_utils.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
class GLConcat : public GLFilter {
|
||||
public:
|
||||
bool tiling_;
|
||||
binding* inputData;
|
||||
binding* outputSize;
|
||||
binding* inputTileRange;
|
||||
binding* input_tile_x;
|
||||
|
||||
GLConcat(tile_descriptor output_tile_geometries, bool tiling = false)
|
||||
: GLFilter(
|
||||
"GLConcat",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>({BINDING(outputSize),
|
||||
BINDING(inputData),
|
||||
BINDING(inputTileRange),
|
||||
BINDING(input_tile_x)}),
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{{"TILING", c10::to_string(tiling)},
|
||||
{"OUTPUT_TILES", c10::to_string(output_tile_geometries.tiles)},
|
||||
{"OUTPUT_TILE_X",
|
||||
c10::to_string(output_tile_geometries.tile_dims.x)},
|
||||
{"OUTPUT_TILE_WIDTH",
|
||||
c10::to_string(output_tile_geometries.tile_size.x)},
|
||||
{"OUTPUT_TILE_HEIGHT",
|
||||
c10::to_string(output_tile_geometries.tile_size.y)}}),
|
||||
tiling_(tiling) {}
|
||||
|
||||
template <typename T>
|
||||
void concat(const GLImageVector<T>** input_images, const GLImageVector<T>& output_image, int size);
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
// MARK: GLSL
|
||||
|
||||
const char* GLConcat::fragment_shader = R"GLSL(#version 300 es
|
||||
#define TILING $(TILING)
|
||||
|
||||
// tiling
|
||||
#define OUTPUT_TILES $(OUTPUT_TILES)
|
||||
#define OUTPUT_TILE_X $(OUTPUT_TILE_X)
|
||||
#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
|
||||
#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
|
||||
|
||||
precision mediump float;
|
||||
precision mediump int;
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
TEXTURE_INPUT(inputData);
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
uniform ivec2 outputSize;
|
||||
uniform ivec2 inputTileRange; // (]
|
||||
uniform int input_tile_x;
|
||||
|
||||
#if TILING
|
||||
const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
|
||||
|
||||
void main() {
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
|
||||
ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
|
||||
ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
|
||||
int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
|
||||
|
||||
if (tileNum >= inputTileRange.x && tileNum < inputTileRange.y) {
|
||||
tileNum = tileNum - inputTileRange.x;
|
||||
texelCoord = ivec2(tileNum % input_tile_x, tileNum / input_tile_x) * ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT) + tileCoord;
|
||||
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
|
||||
outputData = TEXTURE_STORE(value);
|
||||
} else {
|
||||
// early termination
|
||||
discard;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
void main() {
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
|
||||
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
|
||||
outputData = TEXTURE_STORE(value);
|
||||
}
|
||||
#endif
|
||||
|
||||
)GLSL";
|
||||
|
||||
template <typename T>
|
||||
void GLConcat::concat(const GLImageVector<T>** input_images, const GLImageVector<T>& output_images, int input_size) {
|
||||
for (int k = 0; k < output_images.size(); k++) {
|
||||
GLImage<T>* output_image = output_images[k];
|
||||
|
||||
int is = 0, os = 0;
|
||||
for (int i = 0; i < input_size; i++) {
|
||||
for (int j = 0; j < input_images[i]->slices(); j++) {
|
||||
GLImage<T>* input_image = (*input_images[i])[k];
|
||||
std::vector<texture_attachment> input_attachments;
|
||||
input_attachments.push_back({input_image->textures[j], inputData});
|
||||
|
||||
run(input_attachments,
|
||||
{output_image->textures.begin() + os, output_image->textures.begin() + os + 1},
|
||||
[&]() {
|
||||
glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
|
||||
glUniform2i(inputTileRange->location, is, is + input_image->tile_x * input_image->tile_y);
|
||||
glUniform1i(input_tile_x->location, input_image->tile_x);
|
||||
},
|
||||
output_image->texture_width,
|
||||
output_image->texture_height);
|
||||
if (!tiling_) {
|
||||
os++; // for tiling, you always write to the same texture
|
||||
}
|
||||
is += input_image->tile_x * input_image->tile_y;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace caffe2 {
|
||||
template <typename T>
|
||||
class OpenGLConcatOp final : public Operator<CPUContext>, ImageAllocator<T> {
|
||||
public:
|
||||
OpenGLConcatOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws),
|
||||
order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
|
||||
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const GLImageVector<T>& input0 = Inputs()[0]->template Get<GLImageVector<T>>();
|
||||
const int num_images = input0.size();
|
||||
|
||||
const GLImageVector<T>** input_images = new const GLImageVector<T>*[Inputs().size()];
|
||||
input_images[0] = &input0;
|
||||
int channelCount = input0.channels();
|
||||
|
||||
bool tiling = OperatorBase::GetSingleArgument<int>("tiling", 0);
|
||||
|
||||
// Only supports input channels divisible by 4 for now
|
||||
CAFFE_ENFORCE_EQ(input0.channels() % 4, 0);
|
||||
for (auto i = 1; i < Inputs().size(); i++) {
|
||||
const GLImageVector<T>& inputi = Inputs()[i]->template Get<GLImageVector<T>>();
|
||||
channelCount += inputi.channels();
|
||||
CAFFE_ENFORCE_EQ(num_images, inputi.size());
|
||||
CAFFE_ENFORCE_EQ(inputi.channels() % 4, 0);
|
||||
CAFFE_ENFORCE_EQ(input0.width(), inputi.width());
|
||||
CAFFE_ENFORCE_EQ(input0.height(), inputi.height());
|
||||
input_images[i] = &inputi;
|
||||
|
||||
if (inputi.tile_x() > 1 || inputi.tile_y() > 1) {
|
||||
tiling = true;
|
||||
}
|
||||
}
|
||||
|
||||
const int input_width = input0.width();
|
||||
const int input_height = input0.height();
|
||||
|
||||
const int output_channels = channelCount;
|
||||
const int output_width = input_width;
|
||||
const int output_height = input_height;
|
||||
|
||||
int output_tile_x = 1;
|
||||
int output_tile_y = 1;
|
||||
if (tiling) {
|
||||
computeOutputTiles(output_channels, output_tile_x, output_tile_y);
|
||||
}
|
||||
|
||||
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
|
||||
|
||||
GLImageVector<T>* output = ImageAllocator<T>::newImage(
|
||||
num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
|
||||
if (!_concat) {
|
||||
tile_descriptor output_tile_geometries{
|
||||
{output_tile_x, output_tile_y}, {output_width, output_height}, output_tile_x * output_tile_y};
|
||||
_concat.reset(new GLConcat(output_tile_geometries, tiling));
|
||||
}
|
||||
|
||||
_concat->concat(input_images, *output, Inputs().size());
|
||||
delete[] input_images;
|
||||
Outputs()[0]->Reset(output);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
StorageOrder order_;
|
||||
std::unique_ptr<GLConcat> _concat;
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(OpenGLConcat, OpenGLConcatOp<float16_t>);
|
||||
OPERATOR_SCHEMA(OpenGLConcat).NumInputs(2, 4).NumOutputs(1, 2);
|
||||
} // namespace caffe2
|
File diff suppressed because it is too large
Load Diff
@ -1,232 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "../core/GLFilter.h"
|
||||
#include "../core/GLImage.h"
|
||||
#include "gl_tiling_utils.h"
|
||||
|
||||
class GLConvolution : public GLFilter {
|
||||
public:
|
||||
static constexpr int MaxInputBatchSize = 8;
|
||||
static constexpr int MaxOutputBatchSize = 4;
|
||||
|
||||
struct descriptor {
|
||||
int input_channels;
|
||||
int output_channels;
|
||||
point kernel_size;
|
||||
point input_tile_size;
|
||||
point output_tile_size;
|
||||
point input_tile_grid_size;
|
||||
point output_tile_grid_size;
|
||||
point input_padding;
|
||||
point input_stride;
|
||||
bool transposed;
|
||||
};
|
||||
|
||||
const float* kernel;
|
||||
const float* bias;
|
||||
const float* prelu_scale;
|
||||
|
||||
binding* inputData[MaxInputBatchSize];
|
||||
binding* previousData[MaxOutputBatchSize];
|
||||
binding* outputSize;
|
||||
binding* accumulate;
|
||||
binding* fusePRelu;
|
||||
binding* kernel_block[MaxInputBatchSize];
|
||||
binding* bias_block;
|
||||
binding* prelu_scale_block;
|
||||
binding* inputTileRange;
|
||||
|
||||
const descriptor geometry;
|
||||
const int prelu_scale_size;
|
||||
const int input_batch_size;
|
||||
const int output_batch_size;
|
||||
const int input_tiles;
|
||||
const int output_tiles;
|
||||
const int input_tile_chunk_size;
|
||||
const int output_tile_chunk_size;
|
||||
const int input_tile_batch_size;
|
||||
const int output_tile_batch_size;
|
||||
const bool tiling;
|
||||
|
||||
static const char* fragment_shader;
|
||||
|
||||
GLConvolution(
|
||||
const descriptor& _geometry,
|
||||
const float* _kernel,
|
||||
const float* _bias,
|
||||
const float* _prelu_scale = nullptr,
|
||||
int _prelu_scale_size = 0,
|
||||
int _input_batch_size = 1,
|
||||
int _output_batch_size = 1,
|
||||
int _input_tiles = 1,
|
||||
int _output_tiles = 1,
|
||||
int _input_tile_chunk_size = 1,
|
||||
int _output_tile_chunk_size = 1,
|
||||
int _input_tile_batch_size = 1,
|
||||
int _output_tile_batch_size = 1,
|
||||
bool _tiling = false)
|
||||
: GLFilter(
|
||||
"GLConvolution",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
input_bindings(_input_batch_size, _output_batch_size),
|
||||
uniform_blocks_bindings(
|
||||
_input_batch_size,
|
||||
_output_batch_size,
|
||||
_output_tile_batch_size,
|
||||
_prelu_scale != nullptr),
|
||||
{/* no attributes */},
|
||||
{{"KERNEL_SIZE_X", c10::to_string(_geometry.kernel_size.x)},
|
||||
{"KERNEL_SIZE_Y", c10::to_string(_geometry.kernel_size.y)},
|
||||
{"INPUT_BATCH_SIZE", c10::to_string(_input_batch_size)},
|
||||
{"OUTPUT_BATCH_SIZE", c10::to_string(_output_batch_size)},
|
||||
{"INPUT_TILES", c10::to_string(_input_tiles)},
|
||||
{"OUTPUT_TILES", c10::to_string(_output_tiles)},
|
||||
{"INPUT_TILE_WIDTH", c10::to_string(_geometry.input_tile_size.x)},
|
||||
{"INPUT_TILE_HEIGHT", c10::to_string(_geometry.input_tile_size.y)},
|
||||
{"OUTPUT_TILE_WIDTH",
|
||||
c10::to_string(_geometry.output_tile_size.x)},
|
||||
{"OUTPUT_TILE_HEIGHT",
|
||||
c10::to_string(_geometry.output_tile_size.y)},
|
||||
{"INPUT_TILE_X", c10::to_string(_geometry.input_tile_grid_size.x)},
|
||||
{"OUTPUT_TILE_X",
|
||||
c10::to_string(_geometry.output_tile_grid_size.x)},
|
||||
{"INPUT_TILE_CHUNK_SIZE", c10::to_string(_input_tile_chunk_size)},
|
||||
{"OUTPUT_TILE_CHUNK_SIZE",
|
||||
c10::to_string(_output_tile_chunk_size)},
|
||||
{"OUTPUT_TILE_BATCH_SIZE",
|
||||
c10::to_string(_output_tile_batch_size)},
|
||||
{"TILED_CONVOLUTION", c10::to_string(_tiling)},
|
||||
{"INPUT_PADDING_X",
|
||||
c10::to_string(
|
||||
_geometry.transposed
|
||||
? _geometry.kernel_size.x - 1 - _geometry.input_padding.x
|
||||
: _geometry.input_padding.x)},
|
||||
{"INPUT_PADDING_Y",
|
||||
c10::to_string(
|
||||
_geometry.transposed
|
||||
? _geometry.kernel_size.y - 1 - _geometry.input_padding.y
|
||||
: _geometry.input_padding.y)},
|
||||
{"INPUT_STRIDE_X", c10::to_string(_geometry.input_stride.x)},
|
||||
{"INPUT_STRIDE_Y", c10::to_string(_geometry.input_stride.y)},
|
||||
{"TRANSPOSED_CONVOLUTION", c10::to_string(_geometry.transposed)},
|
||||
{"BOUNDS_CHECK_MODE",
|
||||
c10::to_string(bounds_check_mode(_tiling, _geometry))}}),
|
||||
kernel(_kernel),
|
||||
bias(_bias),
|
||||
prelu_scale(_prelu_scale),
|
||||
geometry(_geometry),
|
||||
prelu_scale_size(_prelu_scale_size),
|
||||
input_batch_size(_input_batch_size),
|
||||
output_batch_size(_output_batch_size),
|
||||
input_tiles(_input_tiles),
|
||||
output_tiles(_output_tiles),
|
||||
input_tile_chunk_size(_input_tile_chunk_size),
|
||||
output_tile_chunk_size(_output_tile_chunk_size),
|
||||
input_tile_batch_size(_input_tile_batch_size),
|
||||
output_tile_batch_size(_output_tile_batch_size),
|
||||
tiling(_tiling) {}
|
||||
|
||||
~GLConvolution() {}
|
||||
|
||||
template <typename T>
|
||||
void convolution(
|
||||
const GLImageVector<T>& input_images,
|
||||
const GLImageVector<T>& output_images);
|
||||
|
||||
private:
|
||||
/*
|
||||
* Computes BOUNDS_CHECK_MODE for the convolution parameters.
|
||||
*
|
||||
* @retval 0 if bounds check can be skipped
|
||||
* @retval non-zero if bounds check can not be skipped
|
||||
*/
|
||||
inline static int bounds_check_mode(bool tiling, const descriptor& geometry) {
|
||||
if (tiling) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
int input_padding_x = geometry.input_padding.x,
|
||||
input_padding_y = geometry.input_padding.y;
|
||||
if (geometry.transposed) {
|
||||
input_padding_x = geometry.kernel_size.x - 1 - input_padding_x;
|
||||
input_padding_y = geometry.kernel_size.y - 1 - input_padding_y;
|
||||
}
|
||||
|
||||
if (GLContext::getGLContext()->GL_EXT_texture_border_clamp_defined() ||
|
||||
(input_padding_x == 0 && input_padding_y == 0)) {
|
||||
return 0;
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
const std::vector<binding*> input_bindings(
|
||||
int input_batch_size,
|
||||
int output_batch_size) {
|
||||
std::vector<binding*> bindings({BINDING(outputSize),
|
||||
BINDING(accumulate),
|
||||
BINDING(fusePRelu),
|
||||
BINDING(inputTileRange)});
|
||||
|
||||
for (int i = 0; i < input_batch_size; i++) {
|
||||
bindings.push_back(
|
||||
inputData[i] = new binding{"inputData[" + c10::to_string(i) + "]"});
|
||||
}
|
||||
|
||||
for (int i = 0; i < output_batch_size; i++) {
|
||||
bindings.push_back(
|
||||
previousData[i] =
|
||||
new binding{"previousData[" + c10::to_string(i) + "]"});
|
||||
}
|
||||
|
||||
return bindings;
|
||||
}
|
||||
|
||||
const std::vector<binding*> uniform_blocks_bindings(
|
||||
int input_batch_size,
|
||||
int output_batch_size,
|
||||
int output_tile_batch_size,
|
||||
bool fuse_prelu) {
|
||||
std::vector<binding*> bindings({BINDING(bias_block)});
|
||||
if (fuse_prelu) {
|
||||
bindings.push_back(BINDING(prelu_scale_block));
|
||||
}
|
||||
|
||||
for (int i = 0; i < std::max(input_batch_size, output_tile_batch_size);
|
||||
i++) {
|
||||
bindings.push_back(
|
||||
kernel_block[i] =
|
||||
new binding{"Kernel_block[" + c10::to_string(i) + "]"});
|
||||
}
|
||||
|
||||
return bindings;
|
||||
}
|
||||
|
||||
void pack_kernel_data_for_bached_conv(
|
||||
float16_t* data,
|
||||
size_t size,
|
||||
int input_channels,
|
||||
int output_channels,
|
||||
int is,
|
||||
int os,
|
||||
int ib);
|
||||
|
||||
void pack_kernel_data_for_tiled_conv(
|
||||
float16_t* data, // destination
|
||||
size_t size,
|
||||
int input_channels,
|
||||
int output_channels,
|
||||
point input_tile_range,
|
||||
point output_tile_range);
|
||||
|
||||
template <typename T>
|
||||
void run_batched_conv(
|
||||
const GLImageVector<T>& input_images,
|
||||
const GLImageVector<T>& output_images);
|
||||
|
||||
template <typename T>
|
||||
void run_tiled_conv(
|
||||
const GLImageVector<T>& input_images,
|
||||
const GLImageVector<T>& output_images);
|
||||
};
|
@ -1,176 +0,0 @@
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
|
||||
#include "../core/DataTransfer.h"
|
||||
#include "../core/GLContext.h"
|
||||
#include "../core/GLImage.h"
|
||||
#include "../core/GLPlainTexture.h"
|
||||
#include "../core/ImageAllocator.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace caffe2 {
|
||||
template <class T>
|
||||
class CopyToOpenGLOp final : public Operator<CPUContext>, ImageAllocator<T> {
|
||||
public:
|
||||
CopyToOpenGLOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// caffe2::Timer timer;
|
||||
const TensorCPU& X = Input(0);
|
||||
const int num_images = X.dim32(0);
|
||||
const int input_channels = X.dim32(1);
|
||||
const int input_width = X.dim32(3);
|
||||
const int input_height = X.dim32(2);
|
||||
const int input_size = input_width * input_height;
|
||||
|
||||
// set up the OpenGL context
|
||||
GLContext::getGLContext()->set_context();
|
||||
|
||||
const float* input = X.template data<float>();
|
||||
|
||||
int tile_x = GetSingleArgument<int>("tile_x", 1);
|
||||
int tile_y = GetSingleArgument<int>("tile_y", 1);
|
||||
|
||||
GLImageVector<T>* output_image = ImageAllocator<T>::newImage(num_images,
|
||||
input_width,
|
||||
input_height,
|
||||
input_channels,
|
||||
tile_x,
|
||||
tile_y,
|
||||
#if CAFFE2_IOS
|
||||
true
|
||||
#else
|
||||
false
|
||||
#endif
|
||||
);
|
||||
|
||||
if (output_image->tile_x() > 1 || output_image->tile_y() > 1) {
|
||||
LOG(INFO) << "CopyToOpenGLOp tiling: " << output_image->tile_x() << ":"
|
||||
<< output_image->tile_y();
|
||||
}
|
||||
|
||||
Outputs()[0]->Reset(output_image);
|
||||
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
const auto textures = (*output_image)[i]->textures;
|
||||
for (int slice = 0; slice < textures.size(); slice++) {
|
||||
// timer.Start();
|
||||
|
||||
textures[slice]->map_load([&](void* buffer,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t stride,
|
||||
size_t channels,
|
||||
const GLTexture::Type& type) {
|
||||
for (int y = 0; y < tile_y; y++) {
|
||||
for (int x = 0; x < tile_x; x++) {
|
||||
const int tiles = slice * tile_x * tile_y + y * tile_x + x;
|
||||
const int slice_channels = std::min(4, input_channels - 4 * tiles);
|
||||
interleaveSlice(
|
||||
(float16_t*)buffer + 4 * (y * input_height * stride + x * input_width),
|
||||
&input[i * input_channels * input_size + 4 * tiles * input_size],
|
||||
input_width,
|
||||
input_height,
|
||||
stride, // texture stride
|
||||
slice_channels);
|
||||
}
|
||||
}
|
||||
});
|
||||
// LOG(INFO) << "Texture uploading takes " << timer.MilliSeconds() << " ms";
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(CopyToOpenGL, CopyToOpenGLOp<float16_t>);
|
||||
OPERATOR_SCHEMA(CopyToOpenGL).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
|
||||
|
||||
template <class T>
|
||||
class CopyFromOpenGLOp final : public Operator<CPUContext> {
|
||||
public:
|
||||
CopyFromOpenGLOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
caffe2::Timer timer;
|
||||
const GLImageVector<T>& X = Inputs()[0]->template Get<GLImageVector<T>>();
|
||||
const int num_images = X.size();
|
||||
const int input_channels = X.channels();
|
||||
const int input_width = X.width();
|
||||
const int input_height = X.height();
|
||||
|
||||
TensorCPU* Y = Output(0);
|
||||
Y->Resize(num_images, input_channels, input_height, input_width);
|
||||
const int output_width = input_width;
|
||||
const int output_height = input_height;
|
||||
const int output_size = input_width * input_height;
|
||||
|
||||
float* output = Y->mutable_data<float>();
|
||||
|
||||
const int tile_x = X.tile_x();
|
||||
const int tile_y = X.tile_y();
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
for (int slice = 0; slice < X[i]->slices; slice++) {
|
||||
timer.Start();
|
||||
const GLTexture* texture = X[i]->textures[slice];
|
||||
|
||||
texture->map_read([&](const void* buffer,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t stride,
|
||||
size_t channels,
|
||||
const GLTexture::Type& type) {
|
||||
//#if CAFFE2_ANDROID && defined(__ARM_NEON__)
|
||||
// if (static_cast<AndroidGLContext*>(GLContext::getGLContext())->get_platform() ==
|
||||
// Mali) {
|
||||
// caffe2::Timer timer;
|
||||
// timer.Start();
|
||||
// float16_t* copy_buffer = (float16_t*)malloc(_capacity);
|
||||
// arm_memcpy(
|
||||
// (volatile unsigned char*)copy_buffer, (volatile unsigned char*)buffer,
|
||||
// _capacity);
|
||||
// deInterleaveSlice(
|
||||
// output + 4 * slice * output_size, copy_buffer, width, height, stride,
|
||||
// slice_channels);
|
||||
// free(copy_buffer);
|
||||
// LOG(INFO) << "memcpy takes " << timer.MilliSeconds() << " ms";
|
||||
// } else
|
||||
//#endif
|
||||
{
|
||||
gl_log(GL_VERBOSE,
|
||||
"calling deInterleaveSlice width: %d, height: %d, stride: %d, channels: %d\n",
|
||||
width,
|
||||
height,
|
||||
stride,
|
||||
channels);
|
||||
|
||||
for (int y = 0; y < tile_y; y++) {
|
||||
for (int x = 0; x < tile_x; x++) {
|
||||
const int tiles = slice * tile_x * tile_y + y * tile_x + x;
|
||||
const int slice_channels = std::min(4, input_channels - 4 * tiles);
|
||||
deInterleaveSlice(
|
||||
output + i * input_channels * output_size + 4 * tiles * output_size,
|
||||
(float16_t*)buffer + 4 * (y * input_height * stride + x * input_width),
|
||||
input_width,
|
||||
input_height,
|
||||
stride,
|
||||
slice_channels);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(CopyFromOpenGL, CopyFromOpenGLOp<float16_t>);
|
||||
OPERATOR_SCHEMA(CopyFromOpenGL).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
|
||||
} // namespace caffe2
|
@ -1,462 +0,0 @@
|
||||
|
||||
#include "../core/GLFilter.h"
|
||||
#include "../core/GLImage.h"
|
||||
#include "../core/ImageAllocator.h"
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
class GLReduce : public GLFilter {
|
||||
public:
|
||||
binding* inputSize;
|
||||
binding* outputSize;
|
||||
binding* tileSize;
|
||||
binding* inv_pixel_count;
|
||||
binding* epsilon;
|
||||
binding* inputData;
|
||||
binding* averageData;
|
||||
|
||||
bool compute_inv_stdev;
|
||||
bool compute_norm;
|
||||
|
||||
const std::vector<binding*> input_bindings(bool compute_norm_) {
|
||||
std::vector<binding*> bindings({BINDING(inputSize),
|
||||
BINDING(outputSize),
|
||||
BINDING(tileSize),
|
||||
BINDING(inv_pixel_count),
|
||||
BINDING(epsilon),
|
||||
BINDING(inputData)});
|
||||
if (compute_norm_) {
|
||||
bindings.push_back(BINDING(averageData));
|
||||
}
|
||||
return bindings;
|
||||
}
|
||||
|
||||
GLReduce(bool compute_inv_stdev_ = false, bool compute_norm_ = false)
|
||||
: GLFilter(
|
||||
"GLReduce",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
input_bindings(compute_norm_),
|
||||
{/* no uniform_blocks_bindings */},
|
||||
{/* no attributes */},
|
||||
{{"COMPUTE_INV_STDEV", c10::to_string((int)compute_inv_stdev_)},
|
||||
{"COMPUTE_NORM", c10::to_string((int)compute_norm_)}}),
|
||||
compute_inv_stdev(compute_inv_stdev_),
|
||||
compute_norm(compute_norm_) {}
|
||||
|
||||
template <typename T>
|
||||
void reduce(const GLImage<T>* input_image,
|
||||
const GLImage<T>* output_image,
|
||||
int tile_size_x,
|
||||
int tile_size_y,
|
||||
float inv_pixel_count_ = 1.0,
|
||||
float epsilon_ = 0.0);
|
||||
|
||||
template <typename T>
|
||||
void norm(const GLImage<T>* input_image,
|
||||
const GLImage<T>* avg_image,
|
||||
const GLImage<T>* output_image,
|
||||
int tile_size_x,
|
||||
int tile_size_y,
|
||||
float inv_pixel_count_);
|
||||
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
// MARK: GLSL
|
||||
|
||||
const char* GLReduce::fragment_shader = R"GLSL(#version 300 es
|
||||
|
||||
#define COMPUTE_INV_STDEV $(COMPUTE_INV_STDEV)
|
||||
#define COMPUTE_NORM $(COMPUTE_NORM)
|
||||
|
||||
precision mediump float;
|
||||
precision mediump int;
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
|
||||
uniform ivec2 inputSize;
|
||||
uniform ivec2 outputSize;
|
||||
uniform ivec2 tileSize;
|
||||
uniform float inv_pixel_count;
|
||||
uniform float epsilon;
|
||||
|
||||
#if COMPUTE_NORM
|
||||
TEXTURE_INPUT(averageData);
|
||||
#endif
|
||||
|
||||
TEXTURE_INPUT(inputData);
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
void main() {
|
||||
ivec2 outputCoord = ivec2(v_texCoord * vec2(outputSize));
|
||||
ivec2 texelCoord = outputCoord * tileSize;
|
||||
ivec2 sumArea = min(tileSize, inputSize - texelCoord);
|
||||
highp vec4 sum = vec4(0.0);
|
||||
|
||||
#if COMPUTE_NORM
|
||||
vec4 avg = TEXTURE_LOAD(averageData, ivec2(0));
|
||||
#endif
|
||||
|
||||
for (int y = 0; y < sumArea.y; y++) {
|
||||
for (int x = 0; x < sumArea.x; x++) {
|
||||
ivec2 idx = texelCoord + ivec2(x, y);
|
||||
vec4 val = TEXTURE_LOAD(inputData, idx);
|
||||
#if COMPUTE_NORM
|
||||
val -= avg;
|
||||
sum += val * val;
|
||||
#else
|
||||
sum += val;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#if COMPUTE_INV_STDEV
|
||||
outputData = TEXTURE_STORE(inversesqrt(sum * vec4(inv_pixel_count) + vec4(epsilon)));
|
||||
#elif COMPUTE_NORM
|
||||
outputData = TEXTURE_STORE(sum * vec4(inv_pixel_count));
|
||||
#else
|
||||
outputData = TEXTURE_STORE(sum * vec4(inv_pixel_count) + vec4(epsilon));
|
||||
#endif
|
||||
}
|
||||
|
||||
)GLSL";
|
||||
|
||||
template <typename T>
|
||||
void GLReduce::reduce(const GLImage<T>* input_image,
|
||||
const GLImage<T>* output_image,
|
||||
int tile_size_x,
|
||||
int tile_size_y,
|
||||
float inv_pixel_count_,
|
||||
float epsilon_) {
|
||||
int input_slices = input_image->slices;
|
||||
int output_slices = output_image->slices;
|
||||
|
||||
for (int is = 0; is < input_slices; is++) {
|
||||
std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
|
||||
|
||||
run(input_attachments,
|
||||
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
|
||||
[&]() {
|
||||
glUniform2i(inputSize->location, input_image->width, input_image->height);
|
||||
glUniform2i(outputSize->location, output_image->width, output_image->height);
|
||||
glUniform2i(tileSize->location, tile_size_x, tile_size_y);
|
||||
glUniform1f(inv_pixel_count->location, inv_pixel_count_);
|
||||
glUniform1f(epsilon->location, epsilon_);
|
||||
},
|
||||
output_image->width,
|
||||
output_image->height);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GLReduce::norm(const GLImage<T>* input_image,
|
||||
const GLImage<T>* avg_image,
|
||||
const GLImage<T>* output_image,
|
||||
int tile_size_x,
|
||||
int tile_size_y,
|
||||
float inv_pixel_count_) {
|
||||
int input_slices = input_image->slices;
|
||||
int output_slices = output_image->slices;
|
||||
|
||||
for (int is = 0; is < input_slices; is++) {
|
||||
std::vector<texture_attachment> input_attachments(
|
||||
{{input_image->textures[is], inputData}, {avg_image->textures[is], averageData}});
|
||||
|
||||
run(input_attachments,
|
||||
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
|
||||
[&]() {
|
||||
glUniform2i(inputSize->location, input_image->width, input_image->height);
|
||||
glUniform2i(outputSize->location, output_image->width, output_image->height);
|
||||
glUniform2i(tileSize->location, tile_size_x, tile_size_y);
|
||||
glUniform1f(inv_pixel_count->location, inv_pixel_count_);
|
||||
},
|
||||
output_image->width,
|
||||
output_image->height);
|
||||
}
|
||||
}
|
||||
|
||||
class GLScale : public GLFilter {
|
||||
public:
|
||||
binding* outputSize;
|
||||
binding* inputData;
|
||||
binding* averageData;
|
||||
binding* normData;
|
||||
|
||||
binding* scale_factor;
|
||||
binding* bias_factor;
|
||||
binding* prelu_scale_factor;
|
||||
|
||||
const int channels;
|
||||
const float* scale;
|
||||
const float* bias;
|
||||
const float* prelu_scale;
|
||||
const int prelu_size;
|
||||
|
||||
const std::vector<binding*> input_bindings(bool fuse_prelu) {
|
||||
std::vector<binding*> bindings({BINDING(outputSize),
|
||||
BINDING(scale_factor),
|
||||
BINDING(bias_factor),
|
||||
BINDING(inputData),
|
||||
BINDING(averageData),
|
||||
BINDING(normData)});
|
||||
if (fuse_prelu) {
|
||||
bindings.push_back(prelu_scale_factor = new binding({"prelu_scale_factor"}));
|
||||
}
|
||||
return bindings;
|
||||
}
|
||||
|
||||
GLScale(
|
||||
const int _channels,
|
||||
const float* _scale,
|
||||
const float* _bias,
|
||||
const float* _prelu_scale = nullptr,
|
||||
const int _prelu_size = 0)
|
||||
: GLFilter(
|
||||
"GLScale",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
input_bindings(_prelu_scale != nullptr),
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{{"FUSE_PRELU", c10::to_string(_prelu_scale != nullptr)}}),
|
||||
channels(_channels),
|
||||
scale(_scale),
|
||||
bias(_bias),
|
||||
prelu_scale(_prelu_scale),
|
||||
prelu_size(_prelu_size) {}
|
||||
|
||||
template <typename T>
|
||||
void scale_and_shift(const GLImage<T>* input_image,
|
||||
const GLImage<T>* avg_image,
|
||||
const GLImage<T>* norm_image,
|
||||
const GLImage<T>* output_image);
|
||||
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
// MARK: GLSL
|
||||
|
||||
const char* GLScale::fragment_shader = R"GLSL(#version 300 es
|
||||
|
||||
#define FUSE_PRELU $(FUSE_PRELU)
|
||||
|
||||
precision mediump float;
|
||||
precision mediump int;
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
uniform ivec2 outputSize;
|
||||
uniform vec4 scale_factor;
|
||||
uniform vec4 bias_factor;
|
||||
|
||||
#if FUSE_PRELU
|
||||
uniform vec4 prelu_scale_factor;
|
||||
#endif
|
||||
|
||||
TEXTURE_INPUT(inputData);
|
||||
TEXTURE_INPUT(averageData);
|
||||
TEXTURE_INPUT(normData);
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
void main() {
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
|
||||
|
||||
vec4 val = TEXTURE_LOAD(inputData, texelCoord);
|
||||
vec4 avg = TEXTURE_LOAD(averageData, ivec2(0));
|
||||
vec4 inv_stdev = TEXTURE_LOAD(normData, ivec2(0));
|
||||
|
||||
#if FUSE_PRELU
|
||||
vec4 result = (val - avg) * inv_stdev * scale_factor + bias_factor;
|
||||
vec4 o = mix(result * prelu_scale_factor, result, vec4(greaterThan(result, vec4(0))));
|
||||
outputData = TEXTURE_STORE(o);
|
||||
#else
|
||||
vec4 o = (val - avg) * inv_stdev * scale_factor + bias_factor;
|
||||
outputData = TEXTURE_STORE(o);
|
||||
#endif
|
||||
}
|
||||
|
||||
)GLSL";
|
||||
|
||||
template <typename T>
|
||||
void GLScale::scale_and_shift(const GLImage<T>* input_image,
|
||||
const GLImage<T>* avg_image,
|
||||
const GLImage<T>* norm_image,
|
||||
const GLImage<T>* output_image) {
|
||||
int input_slices = input_image->slices;
|
||||
int output_slices = output_image->slices;
|
||||
|
||||
for (int is = 0; is < input_slices; is++) {
|
||||
std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData},
|
||||
{avg_image->textures[is], averageData},
|
||||
{norm_image->textures[is], normData}});
|
||||
|
||||
run(input_attachments,
|
||||
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
|
||||
[&]() {
|
||||
glUniform2i(outputSize->location, output_image->width, output_image->height);
|
||||
glUniform4f(scale_factor->location,
|
||||
scale[4 * is],
|
||||
channels > 4 * is + 1 ? scale[4 * is + 1] : 0,
|
||||
channels > 4 * is + 2 ? scale[4 * is + 2] : 0,
|
||||
channels > 4 * is + 3 ? scale[4 * is + 3] : 0);
|
||||
glUniform4f(bias_factor->location,
|
||||
bias[4 * is],
|
||||
channels > 4 * is + 1 ? bias[4 * is + 1] : 0,
|
||||
channels > 4 * is + 2 ? bias[4 * is + 2] : 0,
|
||||
channels > 4 * is + 3 ? bias[4 * is + 3] : 0);
|
||||
if (prelu_scale != nullptr) {
|
||||
glUniform4f(prelu_scale_factor->location,
|
||||
prelu_size == channels ? prelu_scale[4 * is] : prelu_scale[0],
|
||||
channels > 4 * is + 1 && prelu_size == channels ? prelu_scale[4 * is + 1]
|
||||
: prelu_scale[0],
|
||||
channels > 4 * is + 2 && prelu_size == channels ? prelu_scale[4 * is + 2]
|
||||
: prelu_scale[0],
|
||||
channels > 4 * is + 3 && prelu_size == channels ? prelu_scale[4 * is + 3]
|
||||
: prelu_scale[0]);
|
||||
}
|
||||
},
|
||||
output_image->width,
|
||||
output_image->height);
|
||||
}
|
||||
}
|
||||
|
||||
namespace caffe2 {
|
||||
template <class T, bool FUSE_PRELU>
|
||||
class OpenGLInstanceNormPReluOp final : public Operator<CPUContext>, ImageAllocator<T> {
|
||||
public:
|
||||
OpenGLInstanceNormPReluOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws),
|
||||
epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
|
||||
order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
|
||||
CAFFE_ENFORCE(epsilon_ >= 0, "Must pass a nonnegative epsilon.");
|
||||
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
|
||||
const int num_images = input.size();
|
||||
const int input_channels = input.channels();
|
||||
const int input_width = input.width();
|
||||
const int input_height = input.height();
|
||||
|
||||
const int output_channels = input_channels;
|
||||
const int output_width = input_width;
|
||||
const int output_height = input_height;
|
||||
|
||||
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
|
||||
|
||||
const int tile_size_x = 16;
|
||||
const int tile_size_y = 16;
|
||||
int avg_buf_width = input_width;
|
||||
int avg_buf_height = input_height;
|
||||
|
||||
vector<GLImageVector<T>*> reduce_buf;
|
||||
while (reduce_buf.size() == 0 ||
|
||||
(avg_buf_width > tile_size_x && avg_buf_height > tile_size_y)) {
|
||||
avg_buf_width = (avg_buf_width + tile_size_x - 1) / tile_size_x;
|
||||
avg_buf_height = (avg_buf_height + tile_size_y - 1) / tile_size_y;
|
||||
|
||||
reduce_buf.push_back(
|
||||
ImageAllocator<T>::newImage(1, avg_buf_width, avg_buf_height, output_channels));
|
||||
}
|
||||
|
||||
GLImageVector<T>* avg = ImageAllocator<T>::newImage(num_images, 1, 1, output_channels);
|
||||
GLImageVector<T>* inv_stdev = ImageAllocator<T>::newImage(num_images, 1, 1, output_channels);
|
||||
GLImageVector<T>* output = ImageAllocator<T>::newImage(
|
||||
num_images, output_width, output_height, output_channels, is_last);
|
||||
const float* prelu_data = nullptr;
|
||||
int prelu_size = 0;
|
||||
if (FUSE_PRELU) {
|
||||
DCHECK_EQ(InputSize(), 4);
|
||||
const auto& prelu_scale = Input(PRELU);
|
||||
prelu_data = prelu_scale.template data<float>();
|
||||
prelu_size = prelu_scale.size();
|
||||
} else {
|
||||
DCHECK_EQ(InputSize(), 3);
|
||||
}
|
||||
|
||||
const auto& scale = Input(SCALE);
|
||||
const auto& bias = Input(BIAS);
|
||||
|
||||
if (!f_reduce) {
|
||||
f_reduce.reset(new GLReduce());
|
||||
f_norm.reset(new GLReduce(false, true));
|
||||
f_stdDev.reset(new GLReduce(true, false));
|
||||
f_scale.reset(new GLScale(input_channels,
|
||||
scale.template data<float>(),
|
||||
bias.template data<float>(),
|
||||
prelu_data,
|
||||
prelu_size));
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
for (int k = 0; k < reduce_buf.size() + 1; k++) {
|
||||
const GLImage<T>* in = k == 0 ? input[i] : (*reduce_buf[k - 1])[0];
|
||||
GLImage<T>* out = k == reduce_buf.size() ? (*avg)[i] : (*reduce_buf[k])[0];
|
||||
|
||||
float norm = k < reduce_buf.size()
|
||||
? 1.0 / (tile_size_x * tile_size_y)
|
||||
: (float)pow(tile_size_x * tile_size_y, reduce_buf.size()) /
|
||||
(float)(input_width * input_height);
|
||||
const int running_tile_size_x = k < reduce_buf.size() ? tile_size_x : in->width;
|
||||
const int running_tile_size_y = k < reduce_buf.size() ? tile_size_y : in->height;
|
||||
f_reduce->reduce(in, out, running_tile_size_x, running_tile_size_y, norm);
|
||||
}
|
||||
|
||||
for (int k = 0; k < reduce_buf.size() + 1; k++) {
|
||||
const GLImage<T>* in = k == 0 ? input[i] : (*reduce_buf[k - 1])[0];
|
||||
GLImage<T>* out = k == reduce_buf.size() ? (*inv_stdev)[i] : (*reduce_buf[k])[0];
|
||||
|
||||
float norm = k < reduce_buf.size()
|
||||
? 1.0 / (tile_size_x * tile_size_y)
|
||||
: (float)pow(tile_size_x * tile_size_y, reduce_buf.size()) /
|
||||
(float)(input_width * input_height);
|
||||
|
||||
if (k == 0) {
|
||||
f_norm->norm(in, (*avg)[i], out, tile_size_x, tile_size_y, norm);
|
||||
} else if (k < reduce_buf.size()) {
|
||||
f_reduce->reduce(in, out, tile_size_x, tile_size_y, norm);
|
||||
} else {
|
||||
const int running_tile_size_x = k < reduce_buf.size() ? tile_size_x : in->width;
|
||||
const int running_tile_size_y = k < reduce_buf.size() ? tile_size_y : in->height;
|
||||
f_stdDev->reduce(in, out, running_tile_size_x, running_tile_size_y, norm, epsilon_);
|
||||
}
|
||||
}
|
||||
|
||||
f_scale->scale_and_shift(input[i], (*avg)[i], (*inv_stdev)[i], (*output)[i]);
|
||||
}
|
||||
Outputs()[OUTPUT]->Reset(output);
|
||||
if (OutputSize() > 1) {
|
||||
Outputs()[MEAN]->Reset(avg);
|
||||
Outputs()[INV_STDEV]->Reset(inv_stdev);
|
||||
} else {
|
||||
delete avg;
|
||||
delete inv_stdev;
|
||||
}
|
||||
for (auto&& rb : reduce_buf) {
|
||||
delete rb;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
float epsilon_;
|
||||
StorageOrder order_;
|
||||
std::unique_ptr<GLReduce> f_reduce;
|
||||
std::unique_ptr<GLReduce> f_norm;
|
||||
std::unique_ptr<GLReduce> f_stdDev;
|
||||
std::unique_ptr<GLScale> f_scale;
|
||||
|
||||
INPUT_TAGS(INPUT, SCALE, BIAS, PRELU);
|
||||
OUTPUT_TAGS(OUTPUT, MEAN, INV_STDEV);
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(OpenGLInstanceNorm, OpenGLInstanceNormPReluOp<float16_t, false>);
|
||||
OPERATOR_SCHEMA(OpenGLInstanceNorm).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
|
||||
REGISTER_CPU_OPERATOR(OpenGLInstanceNormPRelu, OpenGLInstanceNormPReluOp<float16_t, true>);
|
||||
OPERATOR_SCHEMA(OpenGLInstanceNormPRelu).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
|
||||
} // namespace caffe2
|
@ -1,120 +0,0 @@
|
||||
|
||||
#include "../core/GLFilter.h"
|
||||
#include "../core/GLImage.h"
|
||||
#include "../core/ImageAllocator.h"
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
|
||||
class GLMul : public GLFilter {
|
||||
public:
|
||||
binding* outputSize;
|
||||
binding* inputData;
|
||||
binding* B;
|
||||
|
||||
GLMul()
|
||||
: GLFilter("GLMul",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>({BINDING(outputSize), BINDING(inputData), BINDING(B)}),
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{/* no replacements */}) {}
|
||||
|
||||
template <typename T>
|
||||
void mul(const GLImageVector<T>& input_images, const GLImageVector<T>& output_images, float b);
|
||||
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
// MARK: GLSL
|
||||
|
||||
const char* GLMul::fragment_shader = R"GLSL(#version 300 es
|
||||
|
||||
precision mediump float;
|
||||
precision mediump int;
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
|
||||
uniform ivec2 outputSize;
|
||||
uniform vec4 B;
|
||||
|
||||
TEXTURE_INPUT(inputData);
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
void main() {
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
|
||||
vec4 A = TEXTURE_LOAD(inputData, texelCoord);
|
||||
outputData = TEXTURE_STORE(A * B);
|
||||
}
|
||||
|
||||
)GLSL";
|
||||
|
||||
template <typename T>
|
||||
void GLMul::mul(const GLImageVector<T>& input_images,
|
||||
const GLImageVector<T>& output_images,
|
||||
float b) {
|
||||
for (int i = 0; i < input_images.size(); i++) {
|
||||
auto input_image = input_images[i];
|
||||
auto output_image = output_images[i];
|
||||
int input_slices = input_image->slices;
|
||||
int output_slices = output_image->slices;
|
||||
|
||||
for (int is = 0; is < input_slices; is++) {
|
||||
run(std::vector<texture_attachment>({{input_image->textures[is], inputData}}),
|
||||
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
|
||||
[&]() {
|
||||
glUniform2i(outputSize->location, output_image->width, output_image->height);
|
||||
glUniform4f(B->location, b, b, b, b);
|
||||
},
|
||||
output_image->width,
|
||||
output_image->height);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace caffe2 {
|
||||
template <class T>
|
||||
class OpenGLMulOp final : public Operator<CPUContext>, ImageAllocator<T> {
|
||||
public:
|
||||
OpenGLMulOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws) {
|
||||
OPERATOR_NEEDS_FEATURE(OperatorBase::GetSingleArgument<int>("broadcast", 0) == 1,
|
||||
"OpenGLMul only supports broadcast");
|
||||
|
||||
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false,
|
||||
"OpenGLMul does not support axis");
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
|
||||
const auto& B = Input(1);
|
||||
CAFFE_ENFORCE_EQ(B.size(), 1); // only scalar is supported
|
||||
|
||||
const int num_images = input.size();
|
||||
const auto output_height = input.height();
|
||||
const auto output_width = input.width();
|
||||
const int output_channels = input.channels();
|
||||
|
||||
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
|
||||
|
||||
GLImageVector<T>* output = ImageAllocator<T>::newImage(
|
||||
num_images, output_width, output_height, output_channels, is_last);
|
||||
|
||||
if (!_mult) {
|
||||
_mult.reset(new GLMul());
|
||||
}
|
||||
|
||||
_mult->mul(input, *output, B.template data<float>()[0]);
|
||||
|
||||
Outputs()[0]->Reset(output);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
std::unique_ptr<GLMul> _mult;
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(OpenGLMul, OpenGLMulOp<float16_t>);
|
||||
} // namespace caffe2
|
@ -1,142 +0,0 @@
|
||||
|
||||
#include "../core/GLFilter.h"
|
||||
#include "../core/GLImage.h"
|
||||
#include "../core/ImageAllocator.h"
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
class GLNormPlanarYUV : public GLFilter {
|
||||
public:
|
||||
const float* mean;
|
||||
const float* std;
|
||||
|
||||
binding* inputData;
|
||||
binding* outputSize;
|
||||
binding* mean_data;
|
||||
binding* std_data;
|
||||
|
||||
GLNormPlanarYUV(const float* _mean, const float* _std)
|
||||
: GLFilter("GLNormPlanarYUV",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>({BINDING(inputData),
|
||||
BINDING(outputSize),
|
||||
BINDING(mean_data),
|
||||
BINDING(std_data)}), // input bindings
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{}),
|
||||
mean(_mean),
|
||||
std(_std) {}
|
||||
|
||||
template <typename T>
|
||||
void normalize(const GLImageVector<T>& input_images, const GLImageVector<T>& output_images);
|
||||
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
// MARK: GLSL
|
||||
|
||||
const char* GLNormPlanarYUV::fragment_shader = R"GLSL(#version 300 es
|
||||
|
||||
precision mediump float;
|
||||
precision mediump int;
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
|
||||
uniform ivec2 outputSize;
|
||||
uniform vec4 mean_data;
|
||||
uniform vec4 std_data;
|
||||
|
||||
TEXTURE_INPUT(inputData);
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
void main() {
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
|
||||
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
|
||||
outputData = TEXTURE_STORE((value - mean_data) / std_data);
|
||||
}
|
||||
|
||||
)GLSL";
|
||||
|
||||
template <class T>
|
||||
void GLNormPlanarYUV::normalize(const GLImageVector<T>& input_images,
|
||||
const GLImageVector<T>& output_images) {
|
||||
int num_images = input_images.size();
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
GLImage<T>* input_image = input_images[i];
|
||||
GLImage<T>* output_image = output_images[i];
|
||||
int input_slices = input_image->slices;
|
||||
int output_slices = output_image->slices;
|
||||
|
||||
for (int is = 0; is < input_slices; is++) {
|
||||
|
||||
std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
|
||||
|
||||
run(input_attachments,
|
||||
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
|
||||
[&]() {
|
||||
glUniform2i(outputSize->location, output_image->width, output_image->height);
|
||||
glUniform4f(mean_data->location, mean[0], mean[1], mean[2], 0.0);
|
||||
glUniform4f(std_data->location, std[0], std[1], std[2], 1.0);
|
||||
},
|
||||
output_image->width,
|
||||
output_image->height);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace caffe2 {
|
||||
template <typename T>
|
||||
class GLNormPlanarYUVOp final : public Operator<CPUContext>, ImageAllocator<T> {
|
||||
public:
|
||||
GLNormPlanarYUVOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws),
|
||||
order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
|
||||
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
|
||||
const int num_images = input.size();
|
||||
const int input_channels = input.channels();
|
||||
const int input_width = input.width();
|
||||
const int input_height = input.height();
|
||||
|
||||
const int output_channels = input_channels;
|
||||
const int output_width = input_width;
|
||||
const int output_height = input_height;
|
||||
|
||||
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
|
||||
|
||||
GLImageVector<T>* output = ImageAllocator<T>::newImage(
|
||||
num_images, output_width, output_height, output_channels, is_last);
|
||||
|
||||
const auto& M = Input(1); // mean
|
||||
const auto& S = Input(2); // standard deviation
|
||||
CAFFE_ENFORCE(input_channels == M.dim(1));
|
||||
CAFFE_ENFORCE(input_channels == S.dim(1));
|
||||
|
||||
if (!_normPlanarYUV) {
|
||||
_normPlanarYUV.reset(new GLNormPlanarYUV(M.template data<float>(), S.template data<float>()));
|
||||
}
|
||||
|
||||
_normPlanarYUV->normalize(input, *output);
|
||||
|
||||
Outputs()[0]->Reset(output);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
StorageOrder order_;
|
||||
std::unique_ptr<GLNormPlanarYUV> _normPlanarYUV;
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(OpenGLNormalizePlanarYUV, GLNormPlanarYUVOp<float16_t>);
|
||||
OPERATOR_SCHEMA(OpenGLNormalizePlanarYUV).NumInputs(3).NumOutputs(1);
|
||||
|
||||
} // namespace caffe2
|
@ -1,273 +0,0 @@
|
||||
|
||||
#include "../core/GLFilter.h"
|
||||
#include "../core/GLImage.h"
|
||||
#include "../core/ImageAllocator.h"
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
class GLPRelu : public GLFilter {
|
||||
public:
|
||||
typedef enum { PRelu = 0, Relu = 1 } ReluType;
|
||||
|
||||
const float* scale;
|
||||
|
||||
binding* inputData;
|
||||
binding* scale_block;
|
||||
|
||||
const int scale_size;
|
||||
const int channels;
|
||||
const int output_tile_x;
|
||||
const int output_tile_y;
|
||||
const int output_tile_width;
|
||||
const int output_tile_height;
|
||||
|
||||
GLPRelu(
|
||||
const float* _scale,
|
||||
const int _scale_size,
|
||||
const int _channels,
|
||||
int _output_tile_x,
|
||||
int _output_tile_y,
|
||||
int _output_tile_width,
|
||||
int _output_tile_height)
|
||||
: GLFilter(
|
||||
"GLPRelu",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>({BINDING(inputData)}),
|
||||
std::vector<binding*>({BINDING(scale_block)}),
|
||||
{/* no attributes */},
|
||||
{{"USE_RELU", c10::to_string(PRelu)},
|
||||
{"OUTPUT_TILES", c10::to_string(_output_tile_x * _output_tile_y)},
|
||||
{"OUTPUT_TILE_X", c10::to_string(_output_tile_x)},
|
||||
{"OUTPUT_TILE_WIDTH", c10::to_string(_output_tile_width)},
|
||||
{"OUTPUT_TILE_HEIGHT", c10::to_string(_output_tile_height)},
|
||||
{"TILED_PRELU",
|
||||
c10::to_string(_output_tile_x > 1 || _output_tile_y > 1)}}),
|
||||
scale(_scale),
|
||||
scale_size(_scale_size),
|
||||
channels(_channels),
|
||||
output_tile_x(_output_tile_x),
|
||||
output_tile_y(_output_tile_y),
|
||||
output_tile_width(_output_tile_width),
|
||||
output_tile_height(_output_tile_height) {}
|
||||
|
||||
GLPRelu(const int _channels)
|
||||
: GLFilter(
|
||||
"GLRelu",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>({BINDING(inputData)}),
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{{"USE_RELU", c10::to_string(Relu)},
|
||||
{"OUTPUT_TILES", c10::to_string(1)},
|
||||
{"OUTPUT_TILE_X", c10::to_string(1)},
|
||||
{"OUTPUT_TILE_WIDTH", c10::to_string(1)},
|
||||
{"OUTPUT_TILE_HEIGHT", c10::to_string(1)},
|
||||
{"TILED_PRELU", c10::to_string(0)}}),
|
||||
scale(nullptr),
|
||||
scale_block(nullptr),
|
||||
scale_size(0),
|
||||
channels(_channels),
|
||||
output_tile_x(1),
|
||||
output_tile_y(1),
|
||||
output_tile_width(1),
|
||||
output_tile_height(1) {}
|
||||
|
||||
template <typename T>
|
||||
void prelu(const GLImageVector<T>& input_images,
|
||||
const GLImageVector<T>& output_images,
|
||||
GLPRelu::ReluType reluType);
|
||||
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
// MARK: GLSL
|
||||
|
||||
const char* GLPRelu::fragment_shader = R"GLSL(#version 300 es
|
||||
#define TILED_PRELU $(TILED_PRELU)
|
||||
#define USE_RELU $(USE_RELU)
|
||||
|
||||
// tiling
|
||||
#define OUTPUT_TILES $(OUTPUT_TILES)
|
||||
#define OUTPUT_TILE_X $(OUTPUT_TILE_X)
|
||||
#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
|
||||
#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
|
||||
|
||||
// common
|
||||
precision mediump float;
|
||||
precision highp int;
|
||||
|
||||
TEXTURE_INPUT(inputData);
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
|
||||
#if USE_RELU
|
||||
|
||||
// Relu
|
||||
void main() {
|
||||
ivec2 inputSize = textureSize(inputData, 0);
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
|
||||
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
|
||||
outputData = TEXTURE_STORE(max(value, vec4(0.0)));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#if TILED_PRELU
|
||||
const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
|
||||
|
||||
layout (std140) uniform scale_block {
|
||||
highp uvec4 scale[(OUTPUT_TILES + 1) / 2];
|
||||
};
|
||||
|
||||
void main() {
|
||||
ivec2 inputSize = textureSize(inputData, 0);
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
|
||||
|
||||
ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
|
||||
int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
|
||||
|
||||
// outputData = value > 0 ? value : value * weight;
|
||||
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
|
||||
vec4 preluValue = (tileNum % 2 == 0) ? unpackHalf4x16(scale[tileNum/2].xy) : unpackHalf4x16(scale[tileNum/2].zw);
|
||||
value = mix(value * preluValue, value, vec4(greaterThan(value, vec4(0))));
|
||||
outputData = TEXTURE_STORE(value);
|
||||
}
|
||||
#else
|
||||
layout (std140) uniform scale_block {
|
||||
highp uvec4 scale;
|
||||
};
|
||||
void main() {
|
||||
ivec2 inputSize = textureSize(inputData, 0);
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
|
||||
|
||||
// outputData = value > 0 ? value : value * weight;
|
||||
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
|
||||
value = mix(value * unpackHalf4x16(scale.xy), value, vec4(greaterThan(value, vec4(0))));
|
||||
outputData = TEXTURE_STORE(value);
|
||||
}
|
||||
#endif // TILED_PRELU
|
||||
|
||||
#endif // USE_RELU
|
||||
|
||||
)GLSL";
|
||||
|
||||
template <typename T>
|
||||
void GLPRelu::prelu(const GLImageVector<T>& input_images,
|
||||
const GLImageVector<T>& output_images,
|
||||
GLPRelu::ReluType reluType) {
|
||||
int num_images = input_images.size();
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
GLImage<T>* input_image = input_images[i];
|
||||
GLImage<T>* output_image = output_images[i];
|
||||
int input_slices = input_image->slices;
|
||||
int output_slices = output_image->slices;
|
||||
|
||||
for (int is = 0; is < input_slices; is++) {
|
||||
if (reluType == PRelu) {
|
||||
attach_uniform_buffer<float16_t>(scale_block, 0, [&](float16_t* data, size_t size) {
|
||||
int output_tiles = output_tile_x * output_tile_y;
|
||||
for (int j = 0, k = 4 * is * output_tiles;
|
||||
k < std::min(channels, 4 * (is + 1) * output_tiles);
|
||||
j++, k++) {
|
||||
data[j] = scale_size == channels ? scale[k] : scale[0];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
std::vector<texture_attachment> input_attachments;
|
||||
|
||||
input_attachments.push_back({input_image->textures[is], inputData});
|
||||
|
||||
run(input_attachments,
|
||||
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
|
||||
[&]() {},
|
||||
output_image->texture_width,
|
||||
output_image->texture_height);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace caffe2 {
|
||||
template <typename T, GLPRelu::ReluType reluType>
|
||||
class OpenGLPReluOp final : public Operator<CPUContext>, ImageAllocator<T> {
|
||||
public:
|
||||
OpenGLPReluOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws),
|
||||
order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
|
||||
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
|
||||
const int num_images = input.size();
|
||||
const int input_channels = input.channels();
|
||||
const int input_width = input.width();
|
||||
const int input_height = input.height();
|
||||
|
||||
const int output_channels = input_channels;
|
||||
const int output_width = input_width;
|
||||
const int output_height = input_height;
|
||||
|
||||
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
|
||||
|
||||
const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
|
||||
const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
|
||||
if (input_tile_x > 1 || input_tile_y > 1) {
|
||||
CAFFE_ENFORCE_EQ(input.slices(), 1, "Input needs to be tiled in a single texture");
|
||||
}
|
||||
|
||||
GLImageVector<T>* output = ImageAllocator<T>::newImage(num_images,
|
||||
output_width,
|
||||
output_height,
|
||||
output_channels,
|
||||
output_tile_x,
|
||||
output_tile_y,
|
||||
is_last);
|
||||
|
||||
const auto* scale = reluType == GLPRelu::PRelu ? &Input(1) : nullptr;
|
||||
|
||||
if (!_prelu) {
|
||||
if (reluType == GLPRelu::PRelu) {
|
||||
_prelu.reset(new GLPRelu(scale->template data<float>(),
|
||||
scale->size(),
|
||||
input_channels,
|
||||
output_tile_x,
|
||||
output_tile_y,
|
||||
output_width,
|
||||
output_height));
|
||||
} else {
|
||||
_prelu.reset(new GLPRelu(input_channels));
|
||||
}
|
||||
}
|
||||
|
||||
_prelu->prelu(input, *output, reluType);
|
||||
|
||||
Outputs()[0]->Reset(output);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
StorageOrder order_;
|
||||
std::unique_ptr<GLPRelu> _prelu;
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(OpenGLPRelu, OpenGLPReluOp<float16_t, GLPRelu::PRelu>);
|
||||
OPERATOR_SCHEMA(OpenGLPRelu)
|
||||
.NumInputs(2)
|
||||
.NumOutputs(1)
|
||||
.AllowInplace({{0, 0}})
|
||||
.IdenticalTypeAndShape();
|
||||
REGISTER_CPU_OPERATOR(OpenGLRelu, OpenGLPReluOp<float16_t, GLPRelu::Relu>);
|
||||
OPERATOR_SCHEMA(OpenGLRelu)
|
||||
.NumInputs(1)
|
||||
.NumOutputs(1)
|
||||
.AllowInplace({{0, 0}})
|
||||
.IdenticalTypeAndShape();
|
||||
} // namespace caffe2
|
@ -1,159 +0,0 @@
|
||||
|
||||
#include "../core/GLFilter.h"
|
||||
#include "../core/GLImage.h"
|
||||
#include "../core/ImageAllocator.h"
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
#include "caffe2/operators/conv_pool_op_base.h"
|
||||
|
||||
class GLPadImage : public GLFilter {
|
||||
public:
|
||||
binding* padSize;
|
||||
binding* inputSize;
|
||||
binding* outputSize;
|
||||
binding* inputData;
|
||||
|
||||
GLPadImage()
|
||||
: GLFilter(
|
||||
"GLPadImage",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>(
|
||||
{BINDING(padSize), BINDING(inputSize), BINDING(outputSize), BINDING(inputData)}),
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{/* no replacements */}) {}
|
||||
|
||||
template <typename T>
|
||||
void pad(const GLImageVector<T>& input_images,
|
||||
const GLImageVector<T>& output_images,
|
||||
const int pad_l,
|
||||
const int pad_t);
|
||||
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
// MARK: GLSL
|
||||
|
||||
const char* GLPadImage::fragment_shader = R"GLSL(#version 300 es
|
||||
|
||||
precision mediump float;
|
||||
precision mediump int;
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
|
||||
uniform ivec2 padSize;
|
||||
uniform ivec2 inputSize;
|
||||
uniform ivec2 outputSize;
|
||||
|
||||
TEXTURE_INPUT(inputData);
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
void main() {
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize)) - padSize;
|
||||
texelCoord = max(texelCoord, -texelCoord);
|
||||
texelCoord = min(texelCoord, ivec2(2) * (inputSize - 1) - texelCoord);
|
||||
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
|
||||
outputData = TEXTURE_STORE(value);
|
||||
}
|
||||
|
||||
)GLSL";
|
||||
|
||||
template <typename T>
|
||||
void GLPadImage::pad(const GLImageVector<T>& input_images,
|
||||
const GLImageVector<T>& output_images,
|
||||
const int pad_l,
|
||||
const int pad_t) {
|
||||
for (int i = 0; i < input_images.size(); i++) {
|
||||
auto input_image = input_images[i];
|
||||
auto output_image = output_images[i];
|
||||
int input_slices = input_image->slices;
|
||||
int output_slices = output_image->slices;
|
||||
|
||||
for (int is = 0; is < input_slices; is++) {
|
||||
run(std::vector<texture_attachment>({{input_image->textures[is], inputData}}),
|
||||
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
|
||||
[&]() {
|
||||
glUniform2i(inputSize->location, input_image->width, input_image->height);
|
||||
glUniform2i(outputSize->location, output_image->width, output_image->height);
|
||||
glUniform2i(padSize->location, pad_l, pad_t);
|
||||
},
|
||||
output_image->width,
|
||||
output_image->height);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename OPBase>
|
||||
static void computeOutputHW(OPBase* op, int H, int W, int* OH, int* OW) {
|
||||
Tensor<CPUContext> input, output;
|
||||
input.Resize(1, 1, H, W);
|
||||
op->SetOutputSize(input, &output, 1);
|
||||
CAFFE_ENFORCE_EQ(output.ndim(), 4);
|
||||
*OH = output.dim(2);
|
||||
*OW = output.dim(3);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
class OpenGLPadImageOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator<T> {
|
||||
public:
|
||||
OpenGLPadImageOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvPoolOpBase<CPUContext>(operator_def, ws),
|
||||
mode_(OperatorBase::GetSingleArgument<string>("mode", "")) {
|
||||
OPERATOR_NEEDS_FEATURE(order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
|
||||
OPERATOR_NEEDS_FEATURE(mode_ == "reflect", "OpenGL only supports reflection");
|
||||
|
||||
CAFFE_ENFORCE(legacy_pad_ == LegacyPadding::NOTSET,
|
||||
"Padding layer only supports explicit pad values.");
|
||||
CAFFE_ENFORCE(dilation_h() == 1 && dilation_w() == 1,
|
||||
"Pooling op does not support dilation right now.");
|
||||
CAFFE_ENFORCE(stride_h() == 1 && stride_w() == 1,
|
||||
"Pooling op does not support stride right now.");
|
||||
// Pad op does not use kernel sizes, so we set it to 1 for computing the
|
||||
// output size.
|
||||
kernel_.assign(pads_.size() / 2, 1);
|
||||
}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override {
|
||||
const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
|
||||
|
||||
const int num_images = input.size();
|
||||
const int input_width = input.width();
|
||||
const int input_height = input.height();
|
||||
const int input_channels = input.channels();
|
||||
const int output_channels = input_channels;
|
||||
|
||||
int output_height, output_width;
|
||||
computeOutputHW(this, input_height, input_width, &output_height, &output_width);
|
||||
|
||||
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
|
||||
|
||||
GLImageVector<T>* output = ImageAllocator<T>::newImage(
|
||||
num_images, output_width, output_height, output_channels, is_last);
|
||||
|
||||
if (!padImage_) {
|
||||
padImage_.reset(new GLPadImage());
|
||||
LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => "
|
||||
<< output_channels << ": " << output_height << " X " << output_width;
|
||||
LOG(INFO) << "Padmode: " << mode_ << ", pad_l = " << pad_l() << ", pad_r = " << pad_r() << ", pad_t = " << pad_t()
|
||||
<< ", pad_b = " << pad_b();
|
||||
}
|
||||
|
||||
padImage_->pad(input, *output, pad_l(), pad_t());
|
||||
|
||||
Outputs()[0]->Reset(output);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
std::string mode_;
|
||||
std::unique_ptr<GLPadImage> padImage_;
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(OpenGLPadImage, OpenGLPadImageOp<float16_t>);
|
||||
OPERATOR_SCHEMA(OpenGLPadImage).NumInputs(1).NumOutputs(1);
|
||||
} // namespace caffe2
|
@ -1,339 +0,0 @@
|
||||
|
||||
#include "../core/GLFilter.h"
|
||||
#include "../core/GLImage.h"
|
||||
#include "../core/ImageAllocator.h"
|
||||
|
||||
#include "caffe2/core/timer.h"
|
||||
#include "caffe2/operators/pool_op.h"
|
||||
|
||||
class GLPool : public GLFilter {
|
||||
public:
|
||||
typedef enum { AveragePool, MaxPool } PoolType;
|
||||
|
||||
struct point {
|
||||
int x;
|
||||
int y;
|
||||
};
|
||||
|
||||
struct descriptor {
|
||||
int channels;
|
||||
point kernel_size;
|
||||
point input_padding;
|
||||
point input_stride;
|
||||
point input_tile_size;
|
||||
point output_tile_size;
|
||||
};
|
||||
|
||||
binding* inputData;
|
||||
binding* kernelSize;
|
||||
binding* outputSize;
|
||||
|
||||
const descriptor geometry;
|
||||
|
||||
GLPool(const descriptor& _geometry, PoolType poolType, bool _tiling)
|
||||
: GLFilter(
|
||||
"GLPool",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
{
|
||||
BINDING(inputData),
|
||||
BINDING(kernelSize),
|
||||
BINDING(outputSize),
|
||||
},
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{{"KERNEL_SIZE_X", c10::to_string(_geometry.kernel_size.x)},
|
||||
{"KERNEL_SIZE_Y", c10::to_string(_geometry.kernel_size.y)},
|
||||
{"INPUT_PADDING_X", c10::to_string(_geometry.input_padding.x)},
|
||||
{"INPUT_PADDING_Y", c10::to_string(_geometry.input_padding.y)},
|
||||
{"INPUT_STRIDE_X", c10::to_string(_geometry.input_stride.x)},
|
||||
{"INPUT_STRIDE_Y", c10::to_string(_geometry.input_stride.y)},
|
||||
{"INPUT_TILE_WIDTH", c10::to_string(_geometry.input_tile_size.x)},
|
||||
{"INPUT_TILE_HEIGHT", c10::to_string(_geometry.input_tile_size.y)},
|
||||
{"OUTPUT_TILE_WIDTH",
|
||||
c10::to_string(_geometry.output_tile_size.x)},
|
||||
{"OUTPUT_TILE_HEIGHT",
|
||||
c10::to_string(_geometry.output_tile_size.y)},
|
||||
{"TILED_POOLING", c10::to_string(_tiling)},
|
||||
{"MAX_POOL", c10::to_string(poolType == MaxPool)},
|
||||
{"BOUNDS_CHECK_MODE", c10::to_string(1)}}),
|
||||
geometry(_geometry) {}
|
||||
~GLPool() {}
|
||||
|
||||
void pool(const GLImageVector<float16_t>& input_images,
|
||||
const GLImageVector<float16_t>& output_images) {
|
||||
for (int i = 0; i < input_images.size(); i++) {
|
||||
auto input_image = input_images[i];
|
||||
auto output_image = output_images[i];
|
||||
int input_slices = input_image->slices;
|
||||
int output_slices = output_image->slices;
|
||||
|
||||
for (int is = 0; is < input_slices; is++) {
|
||||
run({{input_image->textures[is], inputData}},
|
||||
{output_image->textures[is]},
|
||||
[&]() {
|
||||
glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
|
||||
glUniform2i(kernelSize->location, geometry.kernel_size.x, geometry.kernel_size.y);
|
||||
},
|
||||
output_image->texture_width,
|
||||
output_image->texture_height);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
/*
|
||||
* Computes BOUNDS_CHECK_MODE for the convolution parameters.
|
||||
*
|
||||
* @retval 0 if bounds check can be skipped
|
||||
* @retval non-zero if bounds check can not be skipped
|
||||
*/
|
||||
inline static int bounds_check_mode(bool tiling, const descriptor& geometry) {
|
||||
if (tiling) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (GLContext::getGLContext()->GL_EXT_texture_border_clamp_defined() ||
|
||||
(geometry.input_padding.x == 0 && geometry.input_padding.y == 0)) {
|
||||
return 0;
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
// MARK: GLSL
|
||||
const char* GLPool::fragment_shader = R"GLSL(#version 300 es
|
||||
#define TILED_POOLING $(TILED_POOLING)
|
||||
#define MAX_POOL $(MAX_POOL)
|
||||
|
||||
// tiling
|
||||
#define INPUT_TILE_WIDTH $(INPUT_TILE_WIDTH)
|
||||
#define INPUT_TILE_HEIGHT $(INPUT_TILE_HEIGHT)
|
||||
#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
|
||||
#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
|
||||
|
||||
#define BOUNDS_CHECK_MODE $(BOUNDS_CHECK_MODE)
|
||||
|
||||
precision mediump float;
|
||||
precision mediump int;
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
|
||||
const ivec2 input_padding = ivec2($(INPUT_PADDING_X), $(INPUT_PADDING_Y));
|
||||
const ivec2 input_stride = ivec2($(INPUT_STRIDE_X), $(INPUT_STRIDE_Y));
|
||||
const ivec2 kernel_size = ivec2($(KERNEL_SIZE_X), $(KERNEL_SIZE_Y));
|
||||
|
||||
uniform ivec2 kernelSize;
|
||||
uniform ivec2 outputSize;
|
||||
|
||||
TEXTURE_INPUT(inputData);
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
#if BOUNDS_CHECK_MODE == 0
|
||||
#define IN_BOUNDS(p, p0, p1) (true)
|
||||
#else
|
||||
#define IN_BOUNDS(p, p0, p1) (all(greaterThanEqual(p, p0)) && all(lessThan(p, p1)))
|
||||
#endif
|
||||
|
||||
// MIN_FLOAT is -2^14, which is the minimum precision requirement for mediump in OpenGL ES 3.0
|
||||
const float MIN_FLOAT = -exp2(14.0);
|
||||
|
||||
#if TILED_POOLING
|
||||
|
||||
const ivec2 inputTileSize = ivec2(INPUT_TILE_WIDTH, INPUT_TILE_HEIGHT);
|
||||
const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
|
||||
|
||||
// tiled pooling
|
||||
#if MAX_POOL
|
||||
|
||||
#define POOL { \
|
||||
pool = vec4(MIN_FLOAT); \
|
||||
for (int y = 0; y < kernelSize.y; y++) { \
|
||||
for (int x = 0; x < kernelSize.x; x++) { \
|
||||
ivec2 idx = tileCoord + ivec2(x, y); \
|
||||
if IN_BOUNDS(idx, ivec2(0), inputTileSize) { \
|
||||
vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
|
||||
pool = max(pool, data); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define POOL { \
|
||||
int count = 0; \
|
||||
for (int y = 0; y < kernelSize.y; y++) { \
|
||||
for (int x = 0; x < kernelSize.x; x++) { \
|
||||
ivec2 idx = tileCoord + ivec2(x, y); \
|
||||
if IN_BOUNDS(idx, ivec2(0), inputTileSize) { \
|
||||
vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
|
||||
pool += data;\
|
||||
count += 1; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
pool = pool / float(count); \
|
||||
}
|
||||
|
||||
#endif // MAX_POOL
|
||||
|
||||
void main() {
|
||||
ivec2 inputSize = textureSize(inputData, 0);
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
|
||||
|
||||
ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
|
||||
ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
|
||||
tileCoord = input_stride * tileCoord - input_padding;
|
||||
|
||||
ivec2 inputTileOffset = tile * inputTileSize;
|
||||
|
||||
#if MAX_POOL
|
||||
vec4 pool = vec4(0);
|
||||
#else
|
||||
highp vec4 pool = vec4(0);
|
||||
#endif
|
||||
|
||||
POOL;
|
||||
|
||||
outputData = TEXTURE_STORE(pool);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// no tiling
|
||||
#if MAX_POOL
|
||||
|
||||
#define POOL { \
|
||||
pool = vec4(MIN_FLOAT); \
|
||||
for (int y = 0; y < kernelSize.y; y++) { \
|
||||
for (int x = 0; x < kernelSize.x; x++) { \
|
||||
ivec2 idx = texelCoord + ivec2(x, y); \
|
||||
if IN_BOUNDS(idx, ivec2(0), inputSize) { \
|
||||
vec4 data = TEXTURE_LOAD(inputData, idx); \
|
||||
pool = max(pool, data); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define POOL { \
|
||||
int count = 0; \
|
||||
for (int y = 0; y < kernelSize.y; y++) { \
|
||||
for (int x = 0; x < kernelSize.x; x++) { \
|
||||
ivec2 idx = texelCoord + ivec2(x, y); \
|
||||
if IN_BOUNDS(idx, ivec2(0), inputSize) { \
|
||||
vec4 data = TEXTURE_LOAD(inputData, idx); \
|
||||
pool += data; \
|
||||
count += 1; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
pool = pool / float(count); \
|
||||
}
|
||||
|
||||
#endif // MAX_POOL
|
||||
|
||||
void main() {
|
||||
ivec2 inputSize = textureSize(inputData, 0);
|
||||
ivec2 texelCoord = input_stride * ivec2(v_texCoord * vec2(outputSize)) - input_padding;
|
||||
#if MAX_POOL
|
||||
vec4 pool = vec4(0);
|
||||
#else
|
||||
highp vec4 pool = vec4(0);
|
||||
#endif
|
||||
|
||||
POOL;
|
||||
|
||||
outputData = TEXTURE_STORE(pool);
|
||||
}
|
||||
#endif // TILED_POOLING
|
||||
|
||||
)GLSL";
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename OPBase>
|
||||
static void computeOutputHW(OPBase* op, int H, int W, int* OH, int* OW) {
|
||||
Tensor<CPUContext> input, output;
|
||||
input.Resize(1, 1, H, W);
|
||||
op->SetOutputSize(input, &output, 1);
|
||||
CAFFE_ENFORCE_EQ(output.ndim(), 4);
|
||||
*OH = output.dim(2);
|
||||
*OW = output.dim(3);
|
||||
}
|
||||
|
||||
template <typename T, GLPool::PoolType poolType>
|
||||
class GLPoolOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator<float16_t> {
|
||||
public:
|
||||
GLPoolOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvPoolOpBase<CPUContext>(operator_def, ws) {
|
||||
OPERATOR_NEEDS_FEATURE(order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
|
||||
CAFFE_ENFORCE(dilation_h() == 1 && dilation_w() == 1,
|
||||
"Pooling op does not support dilation right now.");
|
||||
if (!global_pooling_) {
|
||||
CAFFE_ENFORCE(pad_t() < kernel_h() && pad_b() < kernel_h() && pad_l() < kernel_w() &&
|
||||
pad_r() < kernel_w(),
|
||||
"Pad should be smaller than kernel.");
|
||||
}
|
||||
}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override {
|
||||
const GLImageVector<T>& input = OperatorBase::Inputs()[0]->template Get<GLImageVector<T>>();
|
||||
const int num_images = input.size();
|
||||
const int input_channels = input.channels();
|
||||
const int input_width = input.width();
|
||||
const int input_height = input.height();
|
||||
|
||||
int output_height;
|
||||
int output_width;
|
||||
const int output_channels = input_channels;
|
||||
|
||||
computeOutputHW(this, input_height, input_width, &output_height, &output_width);
|
||||
|
||||
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
|
||||
|
||||
const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
|
||||
const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
|
||||
|
||||
GLImageVector<T>* output = ImageAllocator<T>::newImage(
|
||||
num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
|
||||
|
||||
GLPool::descriptor geometry{input_channels,
|
||||
{kernel_w(), kernel_h()},
|
||||
{pad_l(), pad_t()},
|
||||
{stride_w(), stride_h()},
|
||||
{input_width, input_height},
|
||||
{output_height, output_width}};
|
||||
|
||||
if (!glPool_) {
|
||||
LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => " << output_channels << ": "
|
||||
<< output_height << " X " << output_width << " Kernel: " << kernel_w() << "X" << kernel_h()
|
||||
<< " Tiling: " << input_tile_x << "X" << input_tile_y;
|
||||
|
||||
glPool_.reset(new GLPool(geometry, poolType, input_tile_x > 1 || input_tile_y > 1));
|
||||
}
|
||||
|
||||
glPool_->pool(input, *output);
|
||||
|
||||
OperatorBase::Outputs()[0]->Reset(output);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
std::unique_ptr<GLPool> glPool_;
|
||||
};
|
||||
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(OpenGLAveragePool, GLPoolOp<float16_t, GLPool::AveragePool>);
|
||||
REGISTER_CPU_OPERATOR(OpenGLMaxPool, GLPoolOp<float16_t, GLPool::MaxPool>);
|
||||
OPERATOR_SCHEMA(OpenGLAveragePool).NumInputs(1).NumOutputs(1);
|
||||
OPERATOR_SCHEMA(OpenGLMaxPool).NumInputs(1).NumOutputs(1);
|
||||
}; // namespace
|
||||
}; // namespace caffe2
|
@ -1,135 +0,0 @@
|
||||
|
||||
#include "../core/GLFilter.h"
|
||||
#include "../core/GLImage.h"
|
||||
#include "../core/ImageAllocator.h"
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
class GLResizeNearest : public GLFilter {
|
||||
public:
|
||||
binding* inputData;
|
||||
binding* outputSize;
|
||||
binding* scale_reverse;
|
||||
|
||||
GLResizeNearest()
|
||||
: GLFilter("GLResizeNearest",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>({BINDING(outputSize), BINDING(scale_reverse), BINDING(inputData)}),
|
||||
{/* no uniform blocks*/},
|
||||
{/* no attributes */},
|
||||
{/* replacements */}) {}
|
||||
|
||||
template <typename T>
|
||||
void resize(const GLImageVector<T>& input_images,
|
||||
const GLImageVector<T>& output_images,
|
||||
float width_scale_rev,
|
||||
float height_scale_rev);
|
||||
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
// MARK: GLSL
|
||||
|
||||
const char* GLResizeNearest::fragment_shader = R"GLSL(#version 300 es
|
||||
|
||||
precision mediump float;
|
||||
precision mediump int;
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
|
||||
uniform ivec2 outputSize;
|
||||
uniform highp vec2 scale_reverse;
|
||||
|
||||
TEXTURE_INPUT(inputData);
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
void main() {
|
||||
// it clamps to the edge by default
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize) * scale_reverse);
|
||||
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
|
||||
outputData = TEXTURE_STORE(value);
|
||||
}
|
||||
)GLSL";
|
||||
|
||||
template <typename T>
|
||||
void GLResizeNearest::resize(const GLImageVector<T>& input_images,
|
||||
const GLImageVector<T>& output_images,
|
||||
float width_scale_rev,
|
||||
float height_scale_rev) {
|
||||
for (int i = 0; i < input_images.size(); i++) {
|
||||
auto input_image = input_images[i];
|
||||
auto output_image = output_images[i];
|
||||
int input_slices = input_image->slices;
|
||||
int output_slices = output_image->slices;
|
||||
|
||||
for (int is = 0; is < input_slices; is++) {
|
||||
std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
|
||||
|
||||
run(input_attachments,
|
||||
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
|
||||
[&]() {
|
||||
glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
|
||||
glUniform2f(scale_reverse->location, width_scale_rev, height_scale_rev);
|
||||
},
|
||||
output_image->texture_width,
|
||||
output_image->texture_height);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <class T>
|
||||
class OpenGLResizeNearestOp final : public Operator<CPUContext>, ImageAllocator<T> {
|
||||
public:
|
||||
OpenGLResizeNearestOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws), width_scale_(1), height_scale_(1) {
|
||||
if (HasArgument("width_scale")) {
|
||||
width_scale_ = static_cast<float>(OperatorBase::GetSingleArgument<float>("width_scale", 1));
|
||||
}
|
||||
if (HasArgument("height_scale")) {
|
||||
height_scale_ = static_cast<float>(OperatorBase::GetSingleArgument<float>("height_scale", 1));
|
||||
}
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
|
||||
const int num_images = input.size();
|
||||
const int input_width = input.width();
|
||||
const int input_height = input.height();
|
||||
const int input_channels = input.channels();
|
||||
|
||||
const int output_width = input_width * width_scale_;
|
||||
const int output_height = input_height * height_scale_;
|
||||
const int output_channels = input_channels;
|
||||
|
||||
const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
|
||||
const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
|
||||
|
||||
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
|
||||
GLImageVector<T>* output = ImageAllocator<T>::newImage(
|
||||
num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
|
||||
|
||||
if (!resizeNearest_) {
|
||||
resizeNearest_.reset(new GLResizeNearest());
|
||||
}
|
||||
resizeNearest_->resize(input, *output, 1.0 / width_scale_, 1.0 / height_scale_);
|
||||
Outputs()[0]->Reset(output);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
float width_scale_;
|
||||
float height_scale_;
|
||||
std::unique_ptr<GLResizeNearest> resizeNearest_;
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(OpenGLResizeNearest, OpenGLResizeNearestOp<float16_t>);
|
||||
OPERATOR_SCHEMA(OpenGLResizeNearest).NumInputs(1).NumOutputs(1);
|
||||
|
||||
} // namespace caffe2
|
@ -1,135 +0,0 @@
|
||||
|
||||
#include "../core/GLFilter.h"
|
||||
#include "../core/GLImage.h"
|
||||
#include "../core/ImageAllocator.h"
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
typedef enum { Sigmoid, Tanh } OpType;
|
||||
|
||||
class GLSigmoid : public GLFilter {
|
||||
public:
|
||||
binding* inputData;
|
||||
binding* outputSize;
|
||||
|
||||
GLSigmoid(OpType opType)
|
||||
: GLFilter(
|
||||
"GLSigmoid",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
{BINDING(outputSize), BINDING(inputData)},
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{{"SIGMOID", c10::to_string(opType == Sigmoid)},
|
||||
{"TANH", c10::to_string(opType == Tanh)}}) {}
|
||||
|
||||
template <typename T>
|
||||
void sigmoid(const GLImageVector<T>& input_images, const GLImageVector<T>& output_images);
|
||||
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
// MARK: GLSL
|
||||
|
||||
const char* GLSigmoid::fragment_shader = R"GLSL(#version 300 es
|
||||
#define SIGMOID $(SIGMOID)
|
||||
#define TANH $(TANH)
|
||||
|
||||
precision mediump float;
|
||||
precision mediump int;
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
|
||||
uniform ivec2 outputSize;
|
||||
|
||||
TEXTURE_INPUT(inputData);
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
void main() {
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
|
||||
vec4 value = TEXTURE_LOAD(inputData, ivec2(texelCoord));
|
||||
#if SIGMOID
|
||||
value = vec4(1.0) / (vec4(1.0) + exp(-value));
|
||||
outputData = TEXTURE_STORE(value);
|
||||
#elif TANH
|
||||
value = tanh(value);
|
||||
outputData = TEXTURE_STORE(value);
|
||||
#endif
|
||||
}
|
||||
|
||||
)GLSL";
|
||||
|
||||
template <typename T>
|
||||
void GLSigmoid::sigmoid(const GLImageVector<T>& input_images,
|
||||
const GLImageVector<T>& output_images) {
|
||||
for (int i = 0; i < input_images.size(); i++) {
|
||||
auto input_image = input_images[i];
|
||||
auto output_image = output_images[i];
|
||||
int input_slices = input_image->slices;
|
||||
int output_slices = output_image->slices;
|
||||
|
||||
for (int is = 0; is < input_slices; is++) {
|
||||
run(std::vector<texture_attachment>({{input_image->textures[is], inputData}}),
|
||||
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
|
||||
[&]() { glUniform2i(outputSize->location, output_image->width, output_image->height); },
|
||||
output_image->width,
|
||||
output_image->height);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace caffe2 {
|
||||
template <typename T, OpType opType>
|
||||
class OpenGLSigmoidOp final : public Operator<CPUContext>, ImageAllocator<T> {
|
||||
public:
|
||||
OpenGLSigmoidOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
|
||||
const int num_images = input.size();
|
||||
const int input_channels = input.channels();
|
||||
const int input_width = input.width();
|
||||
const int input_height = input.height();
|
||||
|
||||
const int output_channels = input_channels;
|
||||
const int output_width = input_width;
|
||||
const int output_height = input_height;
|
||||
|
||||
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
|
||||
|
||||
GLImageVector<T>* output = ImageAllocator<T>::newImage(
|
||||
num_images, output_width, output_height, output_channels, is_last);
|
||||
|
||||
if (!_sigmoid) {
|
||||
_sigmoid.reset(new GLSigmoid(opType));
|
||||
}
|
||||
|
||||
_sigmoid->sigmoid(input, *output);
|
||||
|
||||
Outputs()[0]->Reset(output);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
std::unique_ptr<GLSigmoid> _sigmoid;
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(OpenGLSigmoid, OpenGLSigmoidOp<float16_t, Sigmoid>);
|
||||
OPERATOR_SCHEMA(OpenGLSigmoid)
|
||||
.NumInputs(1)
|
||||
.NumOutputs(1)
|
||||
.AllowInplace({{0, 0}})
|
||||
.IdenticalTypeAndShape();
|
||||
|
||||
REGISTER_CPU_OPERATOR(OpenGLTanh, OpenGLSigmoidOp<float16_t, Tanh>);
|
||||
OPERATOR_SCHEMA(OpenGLTanh)
|
||||
.NumInputs(1)
|
||||
.NumOutputs(1)
|
||||
.AllowInplace({{0, 0}})
|
||||
.IdenticalTypeAndShape();
|
||||
} // namespace caffe2
|
@ -1,434 +0,0 @@
|
||||
|
||||
#include "../core/GLFilter.h"
|
||||
#include "../core/GLImage.h"
|
||||
|
||||
#include "caffe2/core/timer.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
class GLSoftmaxReduce : public GLFilter {
|
||||
public:
|
||||
binding* inputTileSize;
|
||||
binding* outputSize;
|
||||
binding* outputTileSize;
|
||||
binding* tileSize;
|
||||
binding* spatialTileSize;
|
||||
binding* inputTileRange;
|
||||
binding* inputData;
|
||||
binding* maxData;
|
||||
binding* sumData;
|
||||
|
||||
const std::vector<binding*> input_bindings() {
|
||||
std::vector<binding*> bindings({BINDING(inputTileSize),
|
||||
BINDING(outputSize),
|
||||
BINDING(outputTileSize),
|
||||
BINDING(tileSize),
|
||||
BINDING(spatialTileSize),
|
||||
BINDING(inputTileRange),
|
||||
BINDING(inputData),
|
||||
BINDING(maxData),
|
||||
BINDING(sumData)});
|
||||
return bindings;
|
||||
}
|
||||
|
||||
GLSoftmaxReduce(
|
||||
bool compute_sum_ = false,
|
||||
bool tiled = false,
|
||||
int input_tile_x = 1)
|
||||
: GLFilter(
|
||||
"GLSoftmaxReduce",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
input_bindings(),
|
||||
{/* no uniform_blocks_bindings */},
|
||||
{/* no attributes */},
|
||||
{{"COMPUTE_SUM", c10::to_string((int)compute_sum_)},
|
||||
{"INPUT_TILE_X", c10::to_string(input_tile_x)},
|
||||
{"TILED_SOFTMAX", c10::to_string(int(tiled))}}) {}
|
||||
|
||||
template <typename T>
|
||||
void reduce(const GLImage<T>* input_image,
|
||||
const GLImage<T>* output_image,
|
||||
int tile_size_x,
|
||||
int tile_size_y);
|
||||
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
// MARK: GLSL
|
||||
|
||||
const char* GLSoftmaxReduce::fragment_shader = R"GLSL(#version 300 es
|
||||
|
||||
#define TILED_SOFTMAX $(TILED_SOFTMAX)
|
||||
#define INPUT_TILE_X $(INPUT_TILE_X)
|
||||
// Compute sum or max
|
||||
#define COMPUTE_SUM $(COMPUTE_SUM)
|
||||
|
||||
precision highp float;
|
||||
precision mediump int;
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
|
||||
uniform ivec2 inputTileSize;
|
||||
uniform ivec2 outputSize;
|
||||
uniform ivec2 outputTileSize;
|
||||
uniform ivec2 spatialTileSize;
|
||||
uniform ivec2 tileSize;
|
||||
uniform ivec2 inputTileRange;
|
||||
|
||||
TEXTURE_INPUT(inputData);
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
#if TILED_SOFTMAX
|
||||
void main() {
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
|
||||
ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
|
||||
ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
|
||||
ivec2 sumArea = min(spatialTileSize, inputTileSize - tileCoord * spatialTileSize);
|
||||
|
||||
vec4 result = vec4(0.0);
|
||||
for (int tileIdx = inputTileRange.x; tileIdx < inputTileRange.y; tileIdx++) {
|
||||
int inTileX = tileIdx % INPUT_TILE_X;
|
||||
int inTileY = tileIdx / INPUT_TILE_X;
|
||||
ivec2 inputTileOffset = ivec2(inTileX, inTileY) * inputTileSize;
|
||||
for (int y = 0; y < sumArea.y; y++) {
|
||||
for (int x = 0; x < sumArea.x; x++) {
|
||||
ivec2 idx = tileCoord + ivec2(x, y);
|
||||
vec4 val = TEXTURE_LOAD(inputData, inputTileOffset + idx);
|
||||
#if COMPUTE_SUM
|
||||
result += val;
|
||||
#else
|
||||
result = max(result, val);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
outputData = TEXTURE_STORE(result);
|
||||
}
|
||||
#else
|
||||
void main() {
|
||||
ivec2 outputCoord = ivec2(v_texCoord * vec2(outputTileSize));
|
||||
ivec2 texelCoord = outputCoord * spatialTileSize;
|
||||
ivec2 sumArea = min(spatialTileSize, inputTileSize - texelCoord);
|
||||
vec4 result = vec4(0.0);
|
||||
|
||||
for (int y = 0; y < sumArea.y; y++) {
|
||||
for (int x = 0; x < sumArea.x; x++) {
|
||||
ivec2 idx = texelCoord + ivec2(x, y);
|
||||
vec4 val = TEXTURE_LOAD(inputData, idx);
|
||||
#if COMPUTE_SUM
|
||||
result += val;
|
||||
#else
|
||||
result = max(result, val);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
outputData = TEXTURE_STORE(result);
|
||||
}
|
||||
#endif
|
||||
)GLSL";
|
||||
|
||||
template <typename T>
|
||||
void GLSoftmaxReduce::reduce(const GLImage<T>* input_image,
|
||||
const GLImage<T>* output_image,
|
||||
int tile_size_x,
|
||||
int tile_size_y) {
|
||||
int input_slices = input_image->slices;
|
||||
int output_slices = output_image->slices;
|
||||
|
||||
for (int is = 0; is < input_slices; is++) {
|
||||
std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
|
||||
run(input_attachments,
|
||||
{output_image->textures.begin() + is,
|
||||
output_image->textures.begin() + is + 1},
|
||||
[&]() {
|
||||
glUniform2i(
|
||||
inputTileSize->location, input_image->width, input_image->height);
|
||||
glUniform2i(
|
||||
outputSize->location,
|
||||
output_image->texture_width,
|
||||
output_image->texture_height);
|
||||
glUniform2i(
|
||||
outputTileSize->location,
|
||||
output_image->width,
|
||||
output_image->height);
|
||||
glUniform2i(
|
||||
tileSize->location, input_image->tile_x, input_image->tile_y);
|
||||
glUniform2i(spatialTileSize->location, tile_size_x, tile_size_y);
|
||||
glUniform2i(
|
||||
inputTileRange->location,
|
||||
0,
|
||||
std::min(
|
||||
(input_image->channels + 3) / 4,
|
||||
input_image->tile_x * input_image->tile_y));
|
||||
},
|
||||
output_image->texture_width,
|
||||
output_image->texture_height);
|
||||
}
|
||||
}
|
||||
|
||||
class GLSoftmaxScale : public GLFilter {
|
||||
public:
|
||||
binding* outputSize;
|
||||
binding* inputData;
|
||||
binding* maxData;
|
||||
binding* sumData;
|
||||
|
||||
const std::vector<binding*> input_bindings() {
|
||||
std::vector<binding*> bindings(
|
||||
{BINDING(outputSize), BINDING(inputData), BINDING(maxData), BINDING(sumData)});
|
||||
return bindings;
|
||||
}
|
||||
|
||||
GLSoftmaxScale(bool _compute_exp = false, bool tiled = false)
|
||||
: GLFilter(
|
||||
"GLSoftmaxScale",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
input_bindings(),
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{{"COMPUTE_EXP", c10::to_string((int)_compute_exp)},
|
||||
{"TILED_SOFTMAX", c10::to_string((int)tiled)}}) {}
|
||||
|
||||
template <typename T>
|
||||
void scale(const GLImage<T>* input_image,
|
||||
const GLImage<T>* max_image,
|
||||
const GLImage<T>* sum_image,
|
||||
const GLImage<T>* output_image);
|
||||
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void GLSoftmaxScale::scale(const GLImage<T>* input_image,
|
||||
const GLImage<T>* max_image,
|
||||
const GLImage<T>* sum_image,
|
||||
const GLImage<T>* output_image) {
|
||||
int input_slices = input_image->slices;
|
||||
int output_slices = output_image->slices;
|
||||
|
||||
for (int is = 0; is < input_slices; is++) {
|
||||
std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData},
|
||||
{max_image->textures[is], maxData},
|
||||
{sum_image->textures[is], sumData}});
|
||||
run(input_attachments,
|
||||
{output_image->textures.begin() + is,
|
||||
output_image->textures.begin() + is + 1},
|
||||
[&]() {
|
||||
glUniform2i(
|
||||
outputSize->location,
|
||||
output_image->texture_width,
|
||||
output_image->texture_height);
|
||||
},
|
||||
output_image->texture_width,
|
||||
output_image->texture_height);
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: GLSL
|
||||
|
||||
const char* GLSoftmaxScale::fragment_shader = R"GLSL(#version 300 es
|
||||
|
||||
#define COMPUTE_EXP $(COMPUTE_EXP)
|
||||
#define TILED_SOFTMAX $(TILED_SOFTMAX)
|
||||
|
||||
precision highp float;
|
||||
precision mediump int;
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
uniform ivec2 outputSize;
|
||||
|
||||
TEXTURE_INPUT(inputData);
|
||||
TEXTURE_INPUT(maxData);
|
||||
TEXTURE_INPUT(sumData);
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
void main() {
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
|
||||
vec4 val = TEXTURE_LOAD(inputData, texelCoord);
|
||||
#if COMPUTE_EXP
|
||||
vec4 maxVal = TEXTURE_LOAD(maxData, ivec2(0));
|
||||
#if TILED_SOFTMAX
|
||||
float singleMax = max(max(max(maxVal.x, maxVal.y), maxVal.z), maxVal.w);
|
||||
maxVal = vec4(singleMax, singleMax, singleMax, singleMax);
|
||||
outputData = TEXTURE_STORE(exp(val - maxVal));
|
||||
#else
|
||||
outputData = TEXTURE_STORE(exp(val - maxVal));
|
||||
#endif
|
||||
|
||||
#else
|
||||
vec4 sumVal = TEXTURE_LOAD(sumData, ivec2(0));
|
||||
#if TILED_SOFTMAX
|
||||
float singleSum = sumVal.x + sumVal.y + sumVal.z + sumVal.w;
|
||||
sumVal = vec4(singleSum, singleSum, singleSum, singleSum);
|
||||
outputData = TEXTURE_STORE(val / sumVal);
|
||||
#else
|
||||
outputData = TEXTURE_STORE(val / sumVal);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
}
|
||||
)GLSL";
|
||||
|
||||
#include "../core/ImageAllocator.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
#ifndef CAFFE2_MOBILE
|
||||
#error "Caffe2 mobile state not defined"
|
||||
#endif
|
||||
|
||||
#if CAFFE2_MOBILE
|
||||
|
||||
namespace caffe2 {
|
||||
template <class T>
|
||||
class OpenGLSoftmax final : public Operator<CPUContext>, ImageAllocator<T> {
|
||||
public:
|
||||
OpenGLSoftmax(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws),
|
||||
order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
|
||||
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
|
||||
const int num_images = input.size();
|
||||
const int input_channels = input.channels();
|
||||
const int input_width = input.width();
|
||||
const int input_height = input.height();
|
||||
|
||||
const int output_channels = input_channels;
|
||||
const int output_width = input_width;
|
||||
const int output_height = input_height;
|
||||
|
||||
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
|
||||
// For tiling
|
||||
const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
|
||||
const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
|
||||
const bool tiled = input_tile_x > 1 || input_tile_y > 1;
|
||||
if (tiled) {
|
||||
CAFFE_ENFORCE_EQ(
|
||||
input.slices(), 1, "Input needs to be tiled in a single texture");
|
||||
}
|
||||
|
||||
CAFFE_ENFORCE(
|
||||
tiled || input_channels == 1,
|
||||
"Softmax only works for input_channel == 1 or input_channel > 1 with tiling enabled.");
|
||||
|
||||
// for spatial dimension
|
||||
const int tile_size_x = 16;
|
||||
const int tile_size_y = 16;
|
||||
|
||||
int max_buf_width = input_width;
|
||||
int max_buf_height = input_height;
|
||||
int max_buf_channels = input_channels;
|
||||
vector<GLImageVector<T>*> reduce_buf;
|
||||
|
||||
while (reduce_buf.size() == 0 || (max_buf_height > tile_size_y)) {
|
||||
max_buf_width = (max_buf_width + tile_size_x - 1) / tile_size_x;
|
||||
max_buf_height = (max_buf_height + tile_size_y - 1) / tile_size_y;
|
||||
if (tiled) {
|
||||
// since we are summing over all the channels within a channel tile
|
||||
max_buf_channels =
|
||||
(max_buf_channels + input_tile_x * input_tile_y - 1) /
|
||||
(input_tile_x + input_tile_y);
|
||||
}
|
||||
reduce_buf.push_back(ImageAllocator<T>::newImage(
|
||||
1,
|
||||
max_buf_width,
|
||||
max_buf_height,
|
||||
max_buf_channels,
|
||||
output_tile_x,
|
||||
output_tile_y));
|
||||
}
|
||||
|
||||
GLImageVector<T>* max = ImageAllocator<T>::newImage(num_images, 1, 1, 1);
|
||||
GLImageVector<T>* sum = ImageAllocator<T>::newImage(num_images, 1, 1, 1);
|
||||
GLImageVector<T>* after_exp = ImageAllocator<T>::newImage(
|
||||
num_images,
|
||||
output_width,
|
||||
output_height,
|
||||
output_channels,
|
||||
output_tile_x,
|
||||
output_tile_y);
|
||||
GLImageVector<T>* output_images = ImageAllocator<T>::newImage(
|
||||
num_images,
|
||||
output_width,
|
||||
output_height,
|
||||
output_channels,
|
||||
output_tile_x,
|
||||
output_tile_y,
|
||||
is_last);
|
||||
|
||||
if (!f_max) {
|
||||
f_max.reset(new GLSoftmaxReduce(false, tiled, input_tile_x));
|
||||
f_exp.reset(new GLSoftmaxScale(true, tiled));
|
||||
f_sum.reset(new GLSoftmaxReduce(true, tiled, input_tile_x));
|
||||
f_scale.reset(new GLSoftmaxScale(false, tiled));
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
auto input_image = input[i];
|
||||
auto max_image = (*max)[i];
|
||||
auto sum_image = (*sum)[i];
|
||||
auto after_exp_image = (*after_exp)[i];
|
||||
auto output_image = (*output_images)[i];
|
||||
// Get Max
|
||||
for (int ir = 0; ir < reduce_buf.size() + 1; ir++) {
|
||||
const GLImage<T>* in = ir == 0 ? input_image : (*reduce_buf[ir - 1])[0];
|
||||
GLImage<T>* out = ir == reduce_buf.size() ? max_image : (*reduce_buf[ir])[0];
|
||||
|
||||
const int running_tile_size_x =
|
||||
ir < reduce_buf.size() ? tile_size_x : in->width;
|
||||
const int running_tile_size_y =
|
||||
ir < reduce_buf.size() ? tile_size_y : in->height;
|
||||
f_max->reduce(in, out, running_tile_size_x, running_tile_size_y);
|
||||
}
|
||||
// scale vals by exp(x - max)
|
||||
f_exp->scale(input_image, max_image, sum_image, after_exp_image);
|
||||
|
||||
// Get sum of the exp
|
||||
for (int ir = 0; ir < reduce_buf.size() + 1; ir++) {
|
||||
const GLImage<T>* in = ir == 0 ? after_exp_image : (*reduce_buf[ir - 1])[0];
|
||||
GLImage<T>* out = ir == reduce_buf.size() ? sum_image : (*reduce_buf[ir])[0];
|
||||
const int running_tile_size_x = ir < reduce_buf.size() ? tile_size_x : in->width;
|
||||
const int running_tile_size_y = ir < reduce_buf.size() ? tile_size_y : in->height;
|
||||
f_sum->reduce(in, out, running_tile_size_x, running_tile_size_y);
|
||||
}
|
||||
|
||||
// Scale(softmax)
|
||||
f_scale->scale(after_exp_image, max_image, sum_image, output_image);
|
||||
}
|
||||
|
||||
Outputs()[OUTPUT]->Reset(output_images);
|
||||
|
||||
delete sum;
|
||||
delete max;
|
||||
delete after_exp;
|
||||
for (auto&& rb : reduce_buf) {
|
||||
delete rb;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
StorageOrder order_;
|
||||
std::unique_ptr<GLSoftmaxReduce> f_max;
|
||||
std::unique_ptr<GLSoftmaxScale> f_exp;
|
||||
std::unique_ptr<GLSoftmaxReduce> f_sum;
|
||||
std::unique_ptr<GLSoftmaxScale> f_scale;
|
||||
|
||||
INPUT_TAGS(INPUT, FILTER, BIAS);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(OpenGLSoftmax, OpenGLSoftmax<float16_t>);
|
||||
OPERATOR_SCHEMA(OpenGLSoftmax)
|
||||
.NumInputs(1)
|
||||
.NumOutputs(1)
|
||||
.AllowInplace({{0, 0}})
|
||||
.IdenticalTypeAndShape();
|
||||
} // namespace caffe2
|
||||
#endif // CAFFE2_MOBILE
|
@ -1,397 +0,0 @@
|
||||
|
||||
#include "../core/GLContext.h"
|
||||
#include "../core/GLFilter.h"
|
||||
#include "../core/GLImage.h"
|
||||
#include "../core/ImageAllocator.h"
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
enum InputFormat { BGRA = 0, RGBA = 1 };
|
||||
|
||||
class GLStylizer : public GLFilter {
|
||||
binding* inputData;
|
||||
binding* outputSize;
|
||||
binding* mean;
|
||||
binding* noise_std;
|
||||
bool deprocess;
|
||||
|
||||
public:
|
||||
GLStylizer(bool _deprocess = false, InputFormat input_format = BGRA)
|
||||
: GLFilter(
|
||||
_deprocess ? "GLDeStylizer" : "GLStylizer",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>({BINDING(inputData),
|
||||
BINDING(mean),
|
||||
BINDING(noise_std),
|
||||
BINDING(outputSize)}),
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{{"DEPROCESS", c10::to_string(_deprocess)},
|
||||
{"RGBAINPUT", c10::to_string(input_format)}}),
|
||||
deprocess(_deprocess) {}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
void stylize(const GLImage<T1>* input_image,
|
||||
const GLImage<T2>* output_image,
|
||||
const float mean_values[3],
|
||||
float noise_std_value);
|
||||
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
// MARK: GLSL
|
||||
|
||||
const char* GLStylizer::fragment_shader = R"GLSL(#version 300 es
|
||||
|
||||
#define DEPROCESS $(DEPROCESS)
|
||||
#define RGBAINPUT $(RGBAINPUT)
|
||||
|
||||
precision mediump float;
|
||||
precision mediump int;
|
||||
precision mediump sampler2D;
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
|
||||
uniform ivec2 outputSize;
|
||||
|
||||
uniform vec3 mean;
|
||||
uniform float noise_std;
|
||||
|
||||
#if DEPROCESS
|
||||
TEXTURE_INPUT(inputData);
|
||||
layout(location = 0) out mediump vec4 outputData;
|
||||
#else
|
||||
uniform sampler2D inputData;
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
#endif
|
||||
|
||||
#if !DEPROCESS
|
||||
// http://byteblacksmith.com/improvements-to-the-canonical-one-liner-glsl-rand-for-opengl-es-2-0/
|
||||
|
||||
highp float rand(vec2 co) {
|
||||
highp float a = 12.9898;
|
||||
highp float b = 78.233;
|
||||
highp float c = 43758.5453;
|
||||
highp float dt = dot(co.xy, vec2(a, b));
|
||||
highp float sn = mod(dt, 3.14);
|
||||
return fract(sin(sn) * c);
|
||||
}
|
||||
#endif
|
||||
|
||||
// In AR Engine, input/output a RBGA texture; otherwise, BGRA tensor => texture
|
||||
#if RGBAINPUT
|
||||
void main() {
|
||||
#if DEPROCESS
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
|
||||
vec4 val = TEXTURE_LOAD(inputData, texelCoord);
|
||||
outputData = vec4((val.rgb + mean) / 255.0, 1.0).bgra;
|
||||
#else
|
||||
outputData = TEXTURE_STORE(vec4(255.0 * texture(inputData, v_texCoord).bgr - mean + vec3(noise_std * rand(v_texCoord)), 0.0));
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
void main() {
|
||||
#if DEPROCESS
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
|
||||
vec4 val = TEXTURE_LOAD(inputData, texelCoord);
|
||||
outputData = vec4((val.rgb + mean) / 255.0, 1.0);
|
||||
#else
|
||||
outputData = TEXTURE_STORE(vec4(255.0 * texture(inputData, v_texCoord).rgb - mean + vec3(noise_std * rand(v_texCoord)), 0.0));
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
)GLSL";
|
||||
|
||||
template <typename T1, typename T2>
|
||||
void GLStylizer::stylize(const GLImage<T1>* input_image,
|
||||
const GLImage<T2>* output_image,
|
||||
const float mean_values[3],
|
||||
float noise_std_value) {
|
||||
int input_slices = input_image->slices;
|
||||
int output_slices = output_image->slices;
|
||||
|
||||
run(std::vector<texture_attachment>({{input_image->textures[0], inputData}}),
|
||||
{output_image->textures[0]},
|
||||
[&]() {
|
||||
glUniform2i(outputSize->location, output_image->width, output_image->height);
|
||||
glUniform3f(mean->location, mean_values[0], mean_values[1], mean_values[2]);
|
||||
if (!deprocess) {
|
||||
glUniform1f(noise_std->location, noise_std_value);
|
||||
}
|
||||
},
|
||||
output_image->width,
|
||||
output_image->height);
|
||||
}
|
||||
|
||||
namespace caffe2 {
|
||||
class OpenGLTensorToTextureStylizerPreprocessOp : public Operator<CPUContext>,
|
||||
ImageAllocator<uint8_t>,
|
||||
ImageAllocator<float16_t> {
|
||||
public:
|
||||
// Expect this many channels as input
|
||||
static constexpr int kInputChannels = 4;
|
||||
|
||||
// Expect this many channels as output
|
||||
static constexpr int kOutputChannels = 3;
|
||||
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
|
||||
OpenGLTensorToTextureStylizerPreprocessOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws) {}
|
||||
|
||||
bool RunOnDevice() {
|
||||
const auto& input = Input(0);
|
||||
const auto& mean = Input(1);
|
||||
|
||||
CAFFE_ENFORCE(input.ndim() == 4);
|
||||
|
||||
const int num_images = input.dim32(0);
|
||||
const int input_height = input.dim32(1);
|
||||
const int input_width = input.dim32(2);
|
||||
const int input_channels = input.dim32(3);
|
||||
|
||||
CAFFE_ENFORCE(input.dim32(0) == 1); // N == 1
|
||||
CAFFE_ENFORCE(input_channels == kInputChannels);
|
||||
CAFFE_ENFORCE(mean.size() == kOutputChannels); // Assume BGR or BGRA
|
||||
|
||||
// get the buffers from input tensors
|
||||
const float* mean_buffer = mean.template data<float>();
|
||||
const uint8_t* input_buffer = input.template data<uint8_t>();
|
||||
|
||||
// set up the OpenGL context
|
||||
GLContext::getGLContext()->set_context();
|
||||
|
||||
GLImageVector<float16_t>* output_images = ImageAllocator<float16_t>::newImage(num_images,
|
||||
input_width,
|
||||
input_height,
|
||||
kOutputChannels,
|
||||
#if CAFFE2_IOS
|
||||
true
|
||||
#else
|
||||
false
|
||||
#endif
|
||||
);
|
||||
const int tile_x = 1, tile_y = 1;
|
||||
GLImageVector<uint8_t>* input_images = ImageAllocator<uint8_t>::newImage(
|
||||
num_images, input_width, input_height, kInputChannels, tile_x, tile_y, false);
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
auto input_image = (*input_images)[i];
|
||||
auto output_image = (*output_images)[i];
|
||||
const GLTexture* inputTexture = input_image->textures[0];
|
||||
inputTexture->loadData(input_buffer);
|
||||
|
||||
if (!glStylizer_) {
|
||||
glStylizer_.reset(new GLStylizer());
|
||||
}
|
||||
|
||||
glStylizer_->stylize(
|
||||
input_image, output_image, mean_buffer, GetSingleArgument<float>("noise_std", 10.0));
|
||||
}
|
||||
delete input_images;
|
||||
Outputs()[0]->Reset(output_images);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
std::unique_ptr<GLStylizer> glStylizer_;
|
||||
};
|
||||
|
||||
template <InputFormat inputFormat>
|
||||
class OpenGLTextureToTextureStylizerPreprocessOp : public Operator<CPUContext>,
|
||||
ImageAllocator<uint8_t>,
|
||||
ImageAllocator<float16_t> {
|
||||
public:
|
||||
// Expect this many channels as input
|
||||
static constexpr int kInputChannels = 4;
|
||||
|
||||
// Expect this many channels as output
|
||||
static constexpr int kOutputChannels = 3;
|
||||
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
|
||||
OpenGLTextureToTextureStylizerPreprocessOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws) {}
|
||||
|
||||
bool RunOnDevice() {
|
||||
const GLImageVector<uint8_t>& input = Inputs()[0]->template Get<GLImageVector<uint8_t>>();
|
||||
const auto& mean = Input(1);
|
||||
|
||||
const int num_images = input.size();
|
||||
const int input_height = input.height();
|
||||
const int input_width = input.width();
|
||||
const int input_channels = input.channels();
|
||||
|
||||
CAFFE_ENFORCE_GT(num_images, 0);
|
||||
CAFFE_ENFORCE(input[0]->slices == 1); // N == 1
|
||||
CAFFE_ENFORCE(input_channels == kInputChannels);
|
||||
CAFFE_ENFORCE(mean.size() == kOutputChannels); // Assume BGR or BGRA
|
||||
|
||||
// get the buffers from input tensors
|
||||
const float* mean_buffer = mean.template data<float>();
|
||||
|
||||
GLImageVector<float16_t>* output_images = ImageAllocator<float16_t>::newImage(
|
||||
num_images, input_width, input_height, kOutputChannels, false);
|
||||
|
||||
if (!glStylizer_) {
|
||||
glStylizer_.reset(new GLStylizer(false, inputFormat));
|
||||
}
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
auto input_image = input[i];
|
||||
auto output_image = (*output_images)[i];
|
||||
glStylizer_->stylize(
|
||||
input_image, output_image, mean_buffer, GetSingleArgument<float>("noise_std", 10.0));
|
||||
}
|
||||
Outputs()[0]->Reset(output_images);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
std::unique_ptr<GLStylizer> glStylizer_;
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(OpenGLTensorToTextureStylizerPreprocess,
|
||||
OpenGLTensorToTextureStylizerPreprocessOp);
|
||||
OPERATOR_SCHEMA(OpenGLTensorToTextureStylizerPreprocess).NumInputs(2).NumOutputs(1);
|
||||
|
||||
REGISTER_CPU_OPERATOR(OpenGLTextureToTextureStylizerPreprocess,
|
||||
OpenGLTextureToTextureStylizerPreprocessOp<RGBA>);
|
||||
OPERATOR_SCHEMA(OpenGLTextureToTextureStylizerPreprocess).NumInputs(2).NumOutputs(1);
|
||||
|
||||
class OpenGLTextureToTensorStylizerDeprocessOp : public Operator<CPUContext>,
|
||||
ImageAllocator<uint8_t> {
|
||||
public:
|
||||
using Operator<CPUContext>::Operator;
|
||||
|
||||
// Expect this many channels as input
|
||||
static constexpr int kInputChannels = 3;
|
||||
|
||||
// Expect this many channels as output
|
||||
static constexpr int kOutputChannels = 4;
|
||||
|
||||
bool RunOnDevice() {
|
||||
const GLImageVector<float16_t>& input = Inputs()[0]->template Get<GLImageVector<float16_t>>();
|
||||
const auto& mean = Input(1);
|
||||
auto* output = Output(0);
|
||||
|
||||
const int num_images = input.size(), channels = input.channels(), height = input.height(),
|
||||
width = input.width();
|
||||
// Assume BGR or BGRA
|
||||
CAFFE_ENFORCE(mean.size() == kInputChannels);
|
||||
CAFFE_ENFORCE(channels == kInputChannels);
|
||||
// RGB
|
||||
output->Resize(num_images, height, width, kOutputChannels);
|
||||
|
||||
const auto* mean_data = mean.template data<float>();
|
||||
auto* output_buffer = output->template mutable_data<uint8_t>();
|
||||
|
||||
GLImageVector<uint8_t>* output_images =
|
||||
ImageAllocator<uint8_t>::newImage(num_images, width, height, kOutputChannels, true);
|
||||
|
||||
if (!glStylizer_) {
|
||||
glStylizer_.reset(new GLStylizer(true));
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
auto input_image = input[i];
|
||||
auto output_image = (*output_images)[i];
|
||||
glStylizer_->stylize(input_image, output_image, mean_data, 0);
|
||||
|
||||
output_image->textures[0]->map_read([&](const void* buffer,
|
||||
size_t width,
|
||||
size_t height,
|
||||
size_t stride,
|
||||
size_t channels,
|
||||
const GLTexture::Type& type) {
|
||||
if (width == stride) {
|
||||
memcpy(output_buffer, buffer, channels * width * height);
|
||||
} else {
|
||||
typedef uint8_t(input_data_t)[height][stride][channels];
|
||||
typedef uint8_t(output_data_t)[height][width][channels];
|
||||
|
||||
const input_data_t& input_data = *reinterpret_cast<const input_data_t*>(buffer);
|
||||
output_data_t& output_data = *reinterpret_cast<output_data_t*>(output_buffer);
|
||||
|
||||
for (int y = 0; y < height; y++) {
|
||||
memcpy(output_data[y], input_data[y], channels * width);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
delete output_images;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
std::unique_ptr<GLStylizer> glStylizer_;
|
||||
};
|
||||
|
||||
template <InputFormat inputFormat>
|
||||
class OpenGLTextureToTextureStylizerDeprocessOp : public Operator<CPUContext>,
|
||||
ImageAllocator<uint8_t> {
|
||||
public:
|
||||
using Operator<CPUContext>::Operator;
|
||||
|
||||
// Expect this many channels as input
|
||||
static constexpr int kInputChannels = 3;
|
||||
|
||||
// Expect this many channels as output
|
||||
static constexpr int kOutputChannels = 4;
|
||||
|
||||
bool RunOnDevice() {
|
||||
const GLImageVector<float16_t>& input = Inputs()[0]->template Get<GLImageVector<float16_t>>();
|
||||
const auto& mean = Input(1);
|
||||
|
||||
const int num_images = input.size(), channels = input.channels(), height = input.height(),
|
||||
width = input.width();
|
||||
|
||||
CAFFE_ENFORCE(mean.size() == kInputChannels);
|
||||
CAFFE_ENFORCE(channels == kInputChannels);
|
||||
|
||||
const auto* mean_data = mean.template data<float>();
|
||||
|
||||
// Use foreignTextureAllocator inside GLContext
|
||||
// glDeleteTexture will not be called from inside caffe2 for this texture
|
||||
GLImageVector<uint8_t>* output_images;
|
||||
auto textureAllocator = GLContext::getGLContext()->getTextureAllocator();
|
||||
const int tile_x = 1, tile_y = 1;
|
||||
if (textureAllocator != nullptr) {
|
||||
output_images = ImageAllocator<uint8_t>::newImage(
|
||||
num_images, width, height, kOutputChannels, tile_x, tile_y, textureAllocator);
|
||||
} else {
|
||||
// fallback when textureAllocator is not set
|
||||
output_images = ImageAllocator<uint8_t>::newImage(num_images, width, height, kOutputChannels);
|
||||
}
|
||||
|
||||
if (!glStylizer_) {
|
||||
glStylizer_.reset(new GLStylizer(true, inputFormat));
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
auto input_image = input[i];
|
||||
auto output_image = (*output_images)[i];
|
||||
glStylizer_->stylize(input_image, output_image, mean_data, 0);
|
||||
}
|
||||
|
||||
Outputs()[0]->Reset(output_images);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
std::unique_ptr<GLStylizer> glStylizer_;
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(OpenGLTextureToTensorStylizerDeprocess,
|
||||
OpenGLTextureToTensorStylizerDeprocessOp);
|
||||
OPERATOR_SCHEMA(OpenGLTextureToTensorStylizerDeprocess).NumInputs(2).NumOutputs(1);
|
||||
|
||||
REGISTER_CPU_OPERATOR(OpenGLTextureToTextureStylizerDeprocess,
|
||||
OpenGLTextureToTextureStylizerDeprocessOp<RGBA>);
|
||||
OPERATOR_SCHEMA(OpenGLTextureToTextureStylizerDeprocess).NumInputs(2).NumOutputs(1);
|
||||
} // namespace caffe2
|
@ -1,133 +0,0 @@
|
||||
|
||||
#include "../core/GLFilter.h"
|
||||
#include "../core/GLImage.h"
|
||||
#include "../core/ImageAllocator.h"
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
class GLSub : public GLFilter {
|
||||
public:
|
||||
binding* inputData[2];
|
||||
binding* outputSize;
|
||||
|
||||
GLSub()
|
||||
: GLFilter("GLSub",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>({BINDING(outputSize), BINDING(inputData[0]), BINDING(inputData[1])}),
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{/* no replacements */}) {}
|
||||
|
||||
template <typename T>
|
||||
void sub(const GLImageVector<T>& input_image0,
|
||||
const GLImageVector<T>& input_image1,
|
||||
const GLImageVector<T>& output_image);
|
||||
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
// MARK: GLSL
|
||||
|
||||
const char* GLSub::fragment_shader = R"GLSL(#version 300 es
|
||||
|
||||
precision mediump float;
|
||||
precision mediump int;
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
|
||||
uniform ivec2 outputSize;
|
||||
|
||||
TEXTURE_INPUT(inputData[2]);
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
void main() {
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
|
||||
vec4 A = TEXTURE_LOAD(inputData[0], texelCoord);
|
||||
vec4 B = TEXTURE_LOAD(inputData[1], texelCoord);
|
||||
vec4 value = A - B;
|
||||
outputData = TEXTURE_STORE(value);}
|
||||
|
||||
)GLSL";
|
||||
|
||||
template <typename T>
|
||||
void GLSub::sub(const GLImageVector<T>& input_images0,
|
||||
const GLImageVector<T>& input_images1,
|
||||
const GLImageVector<T>& output_images) {
|
||||
const int num_images = input_images0.size();
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
GLImage<T>* input_image0 = input_images0[i];
|
||||
GLImage<T>* input_image1 = input_images1[i];
|
||||
int input_slices = input_image0->slices;
|
||||
GLImage<T>* output_image = output_images[i];
|
||||
int output_slices = output_image->slices;
|
||||
|
||||
for (int is = 0; is < input_slices; is++) {
|
||||
std::vector<texture_attachment> input_attachments;
|
||||
input_attachments.push_back({input_image0->textures[is], inputData[0]});
|
||||
input_attachments.push_back({input_image1->textures[is], inputData[1]});
|
||||
|
||||
run(input_attachments,
|
||||
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
|
||||
[&]() { glUniform2i(outputSize->location, output_image->width, output_image->height); },
|
||||
output_image->width,
|
||||
output_image->height);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace caffe2 {
|
||||
template <typename T>
|
||||
class OpenGLSubOp final : public Operator<CPUContext>, ImageAllocator<T> {
|
||||
public:
|
||||
OpenGLSubOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws) {
|
||||
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("broadcast") == false, "OpenGLSub does not support broadcast");
|
||||
|
||||
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false, "OpenGLSub does not support axis");
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const GLImageVector<T>& input0 = Inputs()[0]->template Get<GLImageVector<T>>();
|
||||
const GLImageVector<T>& input1 = Inputs()[1]->template Get<GLImageVector<T>>();
|
||||
|
||||
CAFFE_ENFORCE_EQ(input0.size(), input1.size());
|
||||
|
||||
const int num_images = input0.size();
|
||||
const int input_channels = input0.channels();
|
||||
const int input_width = input0.width();
|
||||
const int input_height = input0.height();
|
||||
CAFFE_ENFORCE_EQ(input1.channels(), input_channels);
|
||||
CAFFE_ENFORCE_EQ(input1.width(), input_width);
|
||||
CAFFE_ENFORCE_EQ(input1.height(), input_height);
|
||||
|
||||
const int output_channels = input_channels;
|
||||
const int output_width = input_width;
|
||||
const int output_height = input_height;
|
||||
|
||||
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
|
||||
|
||||
GLImageVector<T>* output = ImageAllocator<T>::newImage(
|
||||
num_images, output_width, output_height, output_channels, is_last);
|
||||
|
||||
if (!_sub) {
|
||||
_sub.reset(new GLSub());
|
||||
}
|
||||
|
||||
_sub->sub(input0, input1, *output);
|
||||
|
||||
Outputs()[0]->Reset(output);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
std::unique_ptr<GLSub> _sub;
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(OpenGLSub, OpenGLSubOp<float16_t>);
|
||||
OPERATOR_SCHEMA(OpenGLSub).NumInputs(2).NumOutputs(1);
|
||||
} // namespace caffe2
|
@ -1,33 +0,0 @@
|
||||
#pragma once
|
||||
#include <cmath>
|
||||
|
||||
struct point {
|
||||
int x;
|
||||
int y;
|
||||
};
|
||||
|
||||
struct tile_descriptor {
|
||||
point tile_dims;
|
||||
point tile_size;
|
||||
int tiles;
|
||||
};
|
||||
|
||||
namespace caffe2 {
|
||||
inline static void squareFactors(int N, int& r1, int& r2) {
|
||||
int f = sqrt(N);
|
||||
|
||||
if (f * f == N) {
|
||||
r1 = r2 = f;
|
||||
} else {
|
||||
while (N % f != 0) {
|
||||
f--;
|
||||
}
|
||||
r1 = N / f;
|
||||
r2 = f;
|
||||
}
|
||||
}
|
||||
|
||||
inline static void computeOutputTiles(int output_channels, int& output_tile_x, int& output_tile_y) {
|
||||
squareFactors((output_channels + 3) / 4, output_tile_x, output_tile_y);
|
||||
}
|
||||
} // namespace caffe2
|
@ -1,381 +0,0 @@
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
#include "caffe2/core/workspace.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
#include "../core/GL.h"
|
||||
#include "../core/GLLogging.h"
|
||||
#include "../core/arm_neon_support.h"
|
||||
#include "../operators/gl_tiling_utils.h"
|
||||
#include "TestGLConvolution.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
void AddNoiseInput(const std::vector<int64_t>& shape,
|
||||
const std::string& name,
|
||||
caffe2::Workspace* ws) {
|
||||
caffe2::CPUContext context;
|
||||
caffe2::Blob* blob = ws->CreateBlob(name);
|
||||
auto* tensor = blob->GetMutable<caffe2::TensorCPU>();
|
||||
tensor->Resize(shape);
|
||||
|
||||
caffe2::math::RandGaussian<float, caffe2::CPUContext>(
|
||||
tensor->size(), 0.0f, 10.0f, tensor->mutable_data<float>(), &context);
|
||||
}
|
||||
|
||||
double BenchOp(const std::string& typ,
|
||||
int inputC,
|
||||
int outputC,
|
||||
int kW,
|
||||
int kH,
|
||||
int stride,
|
||||
int inW,
|
||||
int inH,
|
||||
bool transposed,
|
||||
caffe2::Workspace* ws = nullptr) {
|
||||
caffe2::Workspace localWs;
|
||||
if (!ws) {
|
||||
ws = &localWs;
|
||||
}
|
||||
|
||||
const char* engine = transposed ? "MOBILE" : "NNPACK";
|
||||
|
||||
caffe2::OperatorDef def1;
|
||||
def1.set_name("test");
|
||||
def1.set_type(typ);
|
||||
def1.set_engine(engine);
|
||||
def1.add_input("X");
|
||||
def1.add_input("W");
|
||||
def1.add_input("B");
|
||||
def1.add_output("Y");
|
||||
|
||||
def1.add_arg()->CopyFrom(caffe2::MakeArgument("kernel_h", kH));
|
||||
def1.add_arg()->CopyFrom(caffe2::MakeArgument("kernel_w", kW));
|
||||
def1.add_arg()->CopyFrom(caffe2::MakeArgument("stride_h", stride));
|
||||
def1.add_arg()->CopyFrom(caffe2::MakeArgument("stride_w", stride));
|
||||
def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_t", 0));
|
||||
def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_l", 0));
|
||||
def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_b", 0));
|
||||
def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_r", 0));
|
||||
def1.add_arg()->CopyFrom(caffe2::MakeArgument("convolution_transform_strategy", std::string("PRECOMPUTE")));
|
||||
|
||||
AddNoiseInput(std::vector<int64_t>{1, inputC, inH, inW}, "X", ws);
|
||||
if (transposed) {
|
||||
AddNoiseInput(std::vector<int64_t>{inputC, outputC, kH, kW}, "W", ws);
|
||||
} else {
|
||||
AddNoiseInput(std::vector<int64_t>{outputC, inputC, kH, kW}, "W", ws);
|
||||
}
|
||||
AddNoiseInput(std::vector<int64_t>{outputC}, "B", ws);
|
||||
|
||||
std::unique_ptr<caffe2::OperatorBase> op1(CreateOperator(def1, ws));
|
||||
|
||||
// Measure one iteration
|
||||
caffe2::Timer timer;
|
||||
timer.Start();
|
||||
|
||||
op1->Run();
|
||||
|
||||
float one_iteration = timer.MilliSeconds();
|
||||
|
||||
int target_iterations = std::max((int)(1000 / one_iteration), 1);
|
||||
int warmup_iterations = std::max((int)(200 / one_iteration), 1);
|
||||
|
||||
// warm up
|
||||
for (int i = 0; i < warmup_iterations; i++) {
|
||||
op1->Run();
|
||||
}
|
||||
|
||||
timer.Start();
|
||||
|
||||
int runs = target_iterations;
|
||||
for (int i = 0; i < runs; i++) {
|
||||
op1->Run();
|
||||
}
|
||||
|
||||
auto total_t = timer.MilliSeconds();
|
||||
|
||||
gl_log(GL_LOG,
|
||||
"%s(%d -> %d, %dx%d - %dx%d - %s) took: %.4f ms/iter\n",
|
||||
typ.c_str(),
|
||||
inputC,
|
||||
outputC,
|
||||
inW,
|
||||
inH,
|
||||
kW,
|
||||
kH,
|
||||
engine,
|
||||
timer.MilliSeconds() / (float)runs);
|
||||
return double(total_t) / runs;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static double BenchGLConvolution(int input_channels,
|
||||
int output_channels,
|
||||
int kernel_width,
|
||||
int kernel_height,
|
||||
int input_width,
|
||||
int input_height,
|
||||
int input_padding,
|
||||
int input_stride,
|
||||
bool transposed,
|
||||
caffe2::Workspace* ws = nullptr) {
|
||||
int tile_x = 1, tile_y = 1;
|
||||
caffe2::squareFactors((input_channels + 3) / 4, tile_x, tile_y);
|
||||
|
||||
gl_log(GL_LOG, "Input Tiles Factors: %d, %d\n", tile_x, tile_y);
|
||||
|
||||
caffe2::Workspace localWs;
|
||||
if (!ws) {
|
||||
ws = &localWs;
|
||||
}
|
||||
|
||||
AddNoiseInput(
|
||||
std::vector<int64_t>{1, input_channels, input_height, input_width}, "X_cpu", ws);
|
||||
if (transposed) {
|
||||
AddNoiseInput(
|
||||
std::vector<int64_t>{input_channels, output_channels, kernel_height, kernel_width},
|
||||
"W",
|
||||
ws);
|
||||
} else {
|
||||
AddNoiseInput(
|
||||
std::vector<int64_t>{output_channels, input_channels, kernel_height, kernel_width},
|
||||
"W",
|
||||
ws);
|
||||
}
|
||||
AddNoiseInput(std::vector<int64_t>{output_channels}, "b", ws);
|
||||
|
||||
caffe2::NetDef netdef;
|
||||
{
|
||||
auto& op = *(netdef.add_op());
|
||||
op.set_type("CopyToOpenGL");
|
||||
op.add_input("X_cpu");
|
||||
op.add_output("X_gl");
|
||||
{
|
||||
auto& arg = *(op.add_arg());
|
||||
arg.set_name("tile_x");
|
||||
arg.set_i(tile_x);
|
||||
}
|
||||
{
|
||||
auto& arg = *(op.add_arg());
|
||||
arg.set_name("tile_y");
|
||||
arg.set_i(tile_y);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
auto& op = *(netdef.add_op());
|
||||
op.set_type(transposed ? "OpenGLConvTranspose" : "OpenGLConv");
|
||||
op.add_input("X_gl");
|
||||
{
|
||||
op.add_input("W");
|
||||
op.add_input("b");
|
||||
}
|
||||
{
|
||||
auto& arg = *(op.add_arg());
|
||||
arg.set_name("order");
|
||||
arg.set_s("NCHW");
|
||||
}
|
||||
{
|
||||
auto& arg = *(op.add_arg());
|
||||
arg.set_name("kernel");
|
||||
arg.set_i(kernel_height);
|
||||
}
|
||||
{
|
||||
auto& arg = *(op.add_arg());
|
||||
arg.set_name("pad");
|
||||
arg.set_i(input_padding);
|
||||
}
|
||||
{
|
||||
auto& arg = *(op.add_arg());
|
||||
arg.set_name("stride");
|
||||
arg.set_i(input_stride);
|
||||
}
|
||||
{
|
||||
auto& arg = *(op.add_arg());
|
||||
arg.set_name("is_last");
|
||||
arg.set_i(1);
|
||||
}
|
||||
op.add_output("Y_gl");
|
||||
}
|
||||
|
||||
std::vector<std::unique_ptr<caffe2::OperatorBase>> ops;
|
||||
|
||||
for (auto& op : netdef.op()) {
|
||||
ops.push_back(CreateOperator(op, ws));
|
||||
}
|
||||
|
||||
// Run the Copy Operator
|
||||
ops[0]->Run();
|
||||
|
||||
// Make sure the tested operator is precompiled
|
||||
ops[1]->Run();
|
||||
glFinish();
|
||||
|
||||
// Measure one iteration
|
||||
caffe2::Timer timer;
|
||||
timer.Start();
|
||||
|
||||
ops[1]->Run();
|
||||
glFinish();
|
||||
|
||||
float one_iteration = timer.MilliSeconds();
|
||||
|
||||
int target_iterations = std::max((int)(1000 / one_iteration), 1);
|
||||
int warmup_iterations = std::max((int)(200 / one_iteration), 1);
|
||||
|
||||
// warm up
|
||||
for (int i = 0; i < warmup_iterations; i++) {
|
||||
ops[1]->Run();
|
||||
}
|
||||
glFinish();
|
||||
|
||||
timer.Start();
|
||||
|
||||
int runs = target_iterations;
|
||||
for (int i = 0; i < runs; i++) {
|
||||
ops[1]->Run();
|
||||
}
|
||||
glFinish();
|
||||
|
||||
const double gpuIterTime = double(timer.MilliSeconds()) / runs;
|
||||
|
||||
gl_log(GL_LOG,
|
||||
"%s(%d -> %d, %dx%d - %dx%d - OpenGL) took: %.4f ms/iter\n",
|
||||
transposed ? "ConvTranspose" : "Conv",
|
||||
input_channels,
|
||||
output_channels,
|
||||
input_width,
|
||||
input_height,
|
||||
kernel_width,
|
||||
kernel_height,
|
||||
gpuIterTime);
|
||||
|
||||
return gpuIterTime;
|
||||
}
|
||||
|
||||
void TestGLConvolution() {
|
||||
caffe2::Workspace ws;
|
||||
ws.GetThreadPool()->setMinWorkSize(0);
|
||||
|
||||
// small input sizes
|
||||
// std::vector<int> sizes({14, 26, 52, 104});
|
||||
// std::vector<int> channels({128, 64}); // not working for 512 and 256 channels yet
|
||||
// std::vector<int> channels({512, 256, 128, 64});
|
||||
|
||||
// large input sizes
|
||||
// std::vector<int> sizes({208, 312, 416, 720, 1080});
|
||||
// std::vector<int> channels({16, 4});
|
||||
//
|
||||
std::vector<int> sizes({14, 26, 52, 104, 208});
|
||||
// std::vector<int> channels({24, 16, 4});
|
||||
|
||||
// std::vector<int> sizes({14});
|
||||
std::vector<int> channels({32, 64, 128, 192, 256, 384, 512});
|
||||
|
||||
std::vector<int> kernels({3});
|
||||
|
||||
bool transposed = false;
|
||||
|
||||
int stride = 1;
|
||||
|
||||
for (const auto& space : sizes) {
|
||||
for (const auto& input_channel : channels) {
|
||||
int output_channel = input_channel;
|
||||
/* for (const auto& output_channel : channels) */ {
|
||||
for (const auto& kernel : kernels) {
|
||||
const double gpuIterTime = BenchGLConvolution<float16_t>(
|
||||
input_channel, output_channel, kernel, kernel, space, space, 0, stride, transposed, &ws);
|
||||
const double cpuIterTime = BenchOp(transposed ? "ConvTranspose" : "Conv",
|
||||
input_channel,
|
||||
output_channel,
|
||||
kernel,
|
||||
kernel,
|
||||
stride,
|
||||
space,
|
||||
space,
|
||||
transposed,
|
||||
&ws);
|
||||
const double flops = double(input_channel) * output_channel * kernel * kernel *
|
||||
(kernel == 1 ? space : space - 2) * (kernel == 1 ? space : space - 2) * 2;
|
||||
// gl_log(GL_LOG,
|
||||
printf(
|
||||
"Conv: X: %ix%i \tC: %i -> %i\tK: %ix%i\t16b GPU GFLOPS: %.2f\t32b CPU GFLOPS:"
|
||||
"%.2f\tratio: "
|
||||
"%.2f\n",
|
||||
space,
|
||||
space,
|
||||
input_channel,
|
||||
output_channel,
|
||||
kernel,
|
||||
kernel,
|
||||
flops / gpuIterTime / 1E6,
|
||||
flops / cpuIterTime / 1E6,
|
||||
cpuIterTime / gpuIterTime);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// // ConvTranspose
|
||||
// BenchGLConvolution<float16_t>(16, 16, 3, 3, 640, 360, 0, 2, true);
|
||||
// BenchGLConvolution<float16_t>(16, 16, 4, 4, 640, 360, 0, 2, true);
|
||||
// BenchGLConvolution<float16_t>(16, 16, 5, 5, 640, 360, 0, 2, true);
|
||||
// BenchGLConvolution<float16_t>(16, 16, 6, 6, 640, 360, 0, 2, true);
|
||||
// BenchGLConvolution<float16_t>(16, 16, 7, 7, 640, 360, 0, 2, true);
|
||||
// BenchGLConvolution<float16_t>(16, 16, 8, 8, 640, 360, 0, 2, true);
|
||||
// BenchGLConvolution<float16_t>(16, 16, 9, 9, 640, 360, 0, 2, true);
|
||||
//
|
||||
// BenchOp("ConvTranspose", 16, 16, 3, 3, 2, 640, 360, true);
|
||||
// BenchOp("ConvTranspose", 16, 16, 4, 4, 2, 640, 360, true);
|
||||
// BenchOp("ConvTranspose", 16, 16, 5, 5, 2, 640, 360, true);
|
||||
// BenchOp("ConvTranspose", 16, 16, 6, 6, 2, 640, 360, true);
|
||||
// BenchOp("ConvTranspose", 16, 16, 7, 7, 2, 640, 360, true);
|
||||
// BenchOp("ConvTranspose", 16, 16, 8, 8, 2, 640, 360, true);
|
||||
// BenchOp("ConvTranspose", 16, 16, 9, 9, 2, 640, 360, true);
|
||||
//
|
||||
// // Conv
|
||||
// BenchGLConvolution<float16_t>(16, 16, 3, 3, 1280, 720, 0, 1, false);
|
||||
// BenchGLConvolution<float16_t>(16, 16, 4, 4, 1280, 720, 0, 1, false);
|
||||
// BenchGLConvolution<float16_t>(16, 16, 5, 5, 1280, 720, 0, 1, false);
|
||||
// BenchGLConvolution<float16_t>(16, 16, 6, 6, 1280, 720, 0, 1, false);
|
||||
// BenchGLConvolution<float16_t>(16, 16, 7, 7, 1280, 720, 0, 1, false);
|
||||
// BenchGLConvolution<float16_t>(16, 16, 8, 8, 1280, 720, 0, 1, false);
|
||||
// BenchGLConvolution<float16_t>(16, 16, 9, 9, 1280, 720, 0, 1, false);
|
||||
//
|
||||
// BenchOp("Conv", 16, 16, 3, 3, 1, 1280, 720, false);
|
||||
// BenchOp("Conv", 16, 16, 4, 4, 1, 1280, 720, false);
|
||||
// BenchOp("Conv", 16, 16, 5, 5, 1, 1280, 720, false);
|
||||
// BenchOp("Conv", 16, 16, 6, 6, 1, 1280, 720, false);
|
||||
// BenchOp("Conv", 16, 16, 7, 7, 1, 1280, 720, false);
|
||||
// BenchOp("Conv", 16, 16, 8, 8, 1, 1280, 720, false);
|
||||
// BenchOp("Conv", 16, 16, 9, 9, 1, 1280, 720, false);
|
||||
|
||||
// BenchGLConvolution<float16_t>(16, 16, 3, 3, 80, 45, 0, 1, false);
|
||||
// BenchGLConvolution<float16_t>(16, 16, 3, 3, 160, 90, 0, 1, false);
|
||||
// BenchGLConvolution<float16_t>(16, 16, 3, 3, 320, 180, 0, 1, false);
|
||||
// BenchGLConvolution<float16_t>(16, 16, 3, 3, 640, 360, 0, 1, false);
|
||||
// BenchGLConvolution<float16_t>(16, 16, 3, 3, 1280, 720, 0, 1, false);
|
||||
//
|
||||
// BenchOp("Conv", 16, 16, 3, 3, 1, 80, 45, false);
|
||||
// BenchOp("Conv", 16, 16, 3, 3, 1, 160, 90, false);
|
||||
// BenchOp("Conv", 16, 16, 3, 3, 1, 320, 180, false);
|
||||
// BenchOp("Conv", 16, 16, 3, 3, 1, 640, 360, false);
|
||||
// BenchOp("Conv", 16, 16, 3, 3, 1, 1280, 720, false);
|
||||
//
|
||||
// BenchGLConvolution<float16_t>(128, 128, 3, 3, 14, 14, 0, 1, false);
|
||||
// BenchGLConvolution<float16_t>(256, 256, 3, 3, 14, 14, 0, 1, false);
|
||||
// BenchGLConvolution<float16_t>(128, 128, 3, 3, 28, 28, 0, 1, false);
|
||||
// BenchGLConvolution<float16_t>(256, 256, 3, 3, 28, 28, 0, 1, false);
|
||||
// BenchGLConvolution<float16_t>(128, 128, 3, 3, 56, 56, 0, 1, false);
|
||||
// BenchGLConvolution<float16_t>(256, 256, 3, 3, 56, 56, 0, 1, false);
|
||||
// BenchGLConvolution<float16_t>(64, 64, 7, 7, 128, 128, 0, 1, false);
|
||||
//
|
||||
// BenchOp("Conv", 128, 128, 3, 3, 1, 14, 14, false);
|
||||
// BenchOp("Conv", 256, 256, 3, 3, 1, 14, 14, false);
|
||||
// BenchOp("Conv", 128, 128, 3, 3, 1, 28, 28, false);
|
||||
// BenchOp("Conv", 256, 256, 3, 3, 1, 28, 28, false);
|
||||
// BenchOp("Conv", 128, 128, 3, 3, 1, 56, 56, false);
|
||||
// BenchOp("Conv", 256, 256, 3, 3, 1, 56, 56, false);
|
||||
// BenchOp("Conv", 64, 64, 7, 7, 1, 128, 128, false);
|
||||
}
|
@ -1,4 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
void TestGLConvolution();
|
File diff suppressed because it is too large
Load Diff
@ -1,38 +0,0 @@
|
||||
|
||||
#include "caffe2/proto/caffe2_pb.h"
|
||||
|
||||
namespace caffe2 {
|
||||
void testOpenGL();
|
||||
void compareModelsForOpenGL(std::string name,
|
||||
const NetDef& initNet,
|
||||
NetDef predictNet,
|
||||
int width,
|
||||
int height,
|
||||
int channel,
|
||||
std::string input_type,
|
||||
std::string input_order);
|
||||
|
||||
void compareBatchedToTiledModels(std::string name,
|
||||
const NetDef& initNet,
|
||||
NetDef predictNet,
|
||||
int width,
|
||||
int height,
|
||||
int channel,
|
||||
std::string input_type,
|
||||
std::string input_order);
|
||||
|
||||
int runModelBenchmarks(caffe2::NetDef& init_net,
|
||||
caffe2::NetDef& predict_net,
|
||||
int warm_up_runs,
|
||||
int main_runs,
|
||||
int channel,
|
||||
int height,
|
||||
int width,
|
||||
std::string input_type,
|
||||
std::string input_order,
|
||||
std::string engine,
|
||||
bool run_individual = false,
|
||||
bool use_texture_input = false,
|
||||
bool use_tiling = false,
|
||||
bool run_fusion = true);
|
||||
} // namespace caffe2
|
@ -834,23 +834,13 @@ if(USE_PROF)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (USE_MOBILE_OPENGL)
|
||||
if (ANDROID)
|
||||
list(APPEND Caffe2_DEPENDENCY_LIBS EGL GLESv2)
|
||||
elseif (IOS)
|
||||
message(STATUS "TODO item for adding ios opengl dependency")
|
||||
else()
|
||||
message(WARNING "mobile opengl is only used in android or ios builds.")
|
||||
caffe2_update_option(USE_MOBILE_OPENGL OFF)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# ---[ ARM Compute Library: check compatibility.
|
||||
if (USE_ACL)
|
||||
if (NOT ANDROID)
|
||||
message(WARNING "ARM Compute Library is only supported for Android builds.")
|
||||
caffe2_update_option(USE_ACL OFF)
|
||||
else()
|
||||
list(APPEND Caffe2_DEPENDENCY_LIBS EGL GLESv2)
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^armv")
|
||||
# 32-bit ARM (armv7, armv7-a, armv7l, etc)
|
||||
set(ACL_ARCH "armv7a")
|
||||
|
@ -98,7 +98,6 @@ function (caffe2_print_configuration_summary)
|
||||
message(STATUS " USE_METAL : ${USE_METAL}")
|
||||
message(STATUS " USE_MKL : ${CAFFE2_USE_MKL}")
|
||||
message(STATUS " USE_MKLDNN : ${CAFFE2_USE_MKLDNN}")
|
||||
message(STATUS " USE_MOBILE_OPENGL : ${USE_MOBILE_OPENGL}")
|
||||
message(STATUS " USE_NCCL : ${USE_NCCL}")
|
||||
if(${USE_NCCL})
|
||||
message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}")
|
||||
|
@ -85,9 +85,6 @@ CMAKE_ARGS+=("-DANDROID_NDK=$ANDROID_NDK")
|
||||
CMAKE_ARGS+=("-DANDROID_ABI=armeabi-v7a with NEON")
|
||||
CMAKE_ARGS+=("-DANDROID_NATIVE_API_LEVEL=21")
|
||||
CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=rtti exceptions")
|
||||
# TODO: As the toolchain file doesn't support NEON-FP16 extension,
|
||||
# we disable USE_MOBILE_OPENGL for now, it will be re-enabled in the future.
|
||||
CMAKE_ARGS+=("-DUSE_MOBILE_OPENGL=OFF")
|
||||
|
||||
# Use-specified CMake arguments go last to allow overridding defaults
|
||||
CMAKE_ARGS+=($@)
|
||||
|
Reference in New Issue
Block a user