Try to make GL3_BufferAndDraw3D() faster on AMD/Windows

Seems like AMD's Windows driver doesn't like it when we call
glBufferData() *a lot* (other drivers, incl. Intel's, don't seem to
care as much).
Even on an i7-4771 with a Radeon RX 580 I couldn't get a stable 60fps
on Windows without this workaround (the open source Linux driver is ok).

This workaround can be enabled/disabled with the gl3_usebigvbo cvar;
by default it's -1, which means "enable if an AMD driver is detected".

Enabling it when using an NVIDIA GPU with their proprietary driver
reduces performance to 1/3 of the fps we get without it, so it
indeed needs to be conditional...
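
For the record, toggling this from the yq2 console looks like the
following sketch (the value is only evaluated during renderer init,
so a vid_restart is needed for a change to take effect):

    gl3_usebigvbo 1     // force the big-VBO path on
    gl3_usebigvbo 0     // force the plain glBufferData() path
    gl3_usebigvbo -1    // default: auto-detect AMD drivers
    vid_restart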
Daniel Gibson 2019-04-28 06:14:50 +02:00
parent 7b4dc000ad
commit 26a461575b
4 changed files with 123 additions and 4 deletions


@@ -121,6 +121,7 @@ cvar_t *r_modulate;
cvar_t *gl_lightmap;
cvar_t *gl_shadows;
cvar_t *gl3_debugcontext;
cvar_t *gl3_usebigvbo;
// Yaw-Pitch-Roll
// equivalent to R_z * R_y * R_x where R_x is the trans matrix for rotating around X axis for aroundXdeg
@@ -206,6 +207,11 @@ GL3_Register(void)
gl3_particle_fade_factor = ri.Cvar_Get("gl3_particle_fade_factor", "1.2", CVAR_ARCHIVE);
gl3_particle_square = ri.Cvar_Get("gl3_particle_square", "0", CVAR_ARCHIVE);
// 0: use lots of calls to glBufferData()
// 1: reduce calls to glBufferData() with one big VBO (see GL3_BufferAndDraw3D())
// -1: auto (let yq2 choose to enable/disable this based on detected driver)
gl3_usebigvbo = ri.Cvar_Get("gl3_usebigvbo", "-1", CVAR_ARCHIVE);
r_norefresh = ri.Cvar_Get("r_norefresh", "0", 0);
r_drawentities = ri.Cvar_Get("r_drawentities", "1", 0);
r_drawworld = ri.Cvar_Get("r_drawworld", "1", 0);
@@ -530,6 +536,34 @@ GL3_Init(void)
R_Printf(PRINT_ALL, " - OpenGL Debug Output: Not Supported\n");
}
gl3config.useBigVBO = false;
if(gl3_usebigvbo->value == 1.0f)
{
R_Printf(PRINT_ALL, "Enabling useBigVBO workaround because gl3_usebigvbo = 1\n");
gl3config.useBigVBO = true;
}
else if(gl3_usebigvbo->value == -1.0f)
{
// enable for AMD's proprietary Windows and Linux drivers
// TODO: should we match a version number? does the workaround
// slow down legacy drivers that work fine without it?
// This workaround is tested with the following configuration:
// RX580, Win10, driver version "Adrenalin 2019 Edition 19.4.x"
#ifdef _WIN32
if(gl3config.vendor_string != NULL && strstr(gl3config.vendor_string, "ATI") != NULL)
{
R_Printf(PRINT_ALL, "Detected AMD Windows GPU driver, enabling useBigVBO workaround\n");
gl3config.useBigVBO = true;
}
#elif defined(__linux__)
if(gl3config.vendor_string != NULL && strstr(gl3config.vendor_string, "Advanced Micro Devices, Inc.") != NULL)
{
R_Printf(PRINT_ALL, "Detected proprietary AMD GPU driver, enabling useBigVBO workaround\n");
gl3config.useBigVBO = true;
}
#endif
}
// generate texture handles for all possible lightmaps
glGenTextures(MAX_LIGHTMAPS*MAX_LIGHTMAPS_PER_SURFACE, gl3state.lightmap_textureIDs[0]);
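
For context: gl3config.vendor_string presumably comes from
glGetString(GL_VENDOR) during renderer init (that part is not in this
diff); the substrings matched above are what the drivers report there -
AMD's Windows driver identifies itself as "ATI Technologies Inc.", the
proprietary Linux driver as "Advanced Micro Devices, Inc.":

    // presumably done somewhere during GL3_Init() (not shown in this diff):
    gl3config.vendor_string = (const char*)glGetString(GL_VENDOR);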
@@ -589,10 +623,72 @@ GL3_Shutdown(void)
void
GL3_BufferAndDraw3D(const gl3_3D_vtx_t* verts, int numVerts, GLenum drawMode)
{
// TODO: do something more efficient, maybe with glMapBufferRange() + GL_MAP_UNSYNCHRONIZED_BIT
// and glBindBufferRange()
glBufferData( GL_ARRAY_BUFFER, sizeof(gl3_3D_vtx_t)*numVerts, verts, GL_STREAM_DRAW );
glDrawArrays( drawMode, 0, numVerts );
if(!gl3config.useBigVBO)
{
glBufferData( GL_ARRAY_BUFFER, sizeof(gl3_3D_vtx_t)*numVerts, verts, GL_STREAM_DRAW );
glDrawArrays( drawMode, 0, numVerts );
}
else // gl3config.useBigVBO == true
{
/*
* For some reason, AMD's Windows driver doesn't seem to like lots of
* calls to glBufferData() (some of those calls then take very long).
* GL3_BufferAndDraw3D() is called a lot when drawing world geometry
* (once for each visible face, I think?).
* The simple code above caused noticeable slowdowns - even a fast
* quad-core CPU and a Radeon RX580 weren't able to maintain 60fps.
* The workaround is to not call glBufferData() with small data all the
* time, but to allocate one big buffer and use a different region of it
* on each call to GL3_BufferAndDraw3D(), resulting in far fewer calls
* to glBufferData() (=> far fewer buffer allocations in the driver).
* Only when the buffer is full, and at the end of each frame
* (=> GL3_EndFrame()), do we get a fresh buffer.
*
* BTW, we couldn't observe this kind of problem with any other driver:
* neither nvidia's driver, nor AMD's or Intel's open source Linux
* drivers, not even Intel's Windows driver seems to care that much
* about the glBufferData() calls. However, at least nvidia's driver
* doesn't like this workaround (with glMapBufferRange()): the framerate
* dropped significantly - that's why both methods are available and
* selectable at runtime.
*/
#if 0
// I /think/ doing it with glBufferSubData() didn't really help
const int bufSize = gl3state.vbo3Dsize;
int neededSize = numVerts*sizeof(gl3_3D_vtx_t);
int curOffset = gl3state.vbo3DcurOffset;
if(curOffset + neededSize > bufSize)
curOffset = 0;
int curIdx = curOffset / sizeof(gl3_3D_vtx_t);
gl3state.vbo3DcurOffset = curOffset + neededSize;
glBufferSubData( GL_ARRAY_BUFFER, curOffset, neededSize, verts );
glDrawArrays( drawMode, curIdx, numVerts );
#else
int curOffset = gl3state.vbo3DcurOffset;
int neededSize = numVerts*sizeof(gl3_3D_vtx_t);
if(curOffset+neededSize > gl3state.vbo3Dsize)
{
// buffer is full, need to start again from the beginning
// => need to sync or get fresh buffer
// (getting fresh buffer seems easier)
glBufferData(GL_ARRAY_BUFFER, gl3state.vbo3Dsize, NULL, GL_STREAM_DRAW);
curOffset = 0;
}
// as we make sure to use a previously unused part of the buffer,
// doing it unsynchronized should be safe..
GLbitfield accessBits = GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_UNSYNCHRONIZED_BIT;
void* data = glMapBufferRange(GL_ARRAY_BUFFER, curOffset, neededSize, accessBits);
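// NOTE: glMapBufferRange() can return NULL on failure; a hardened
// version would check data before the memcpy() below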
memcpy(data, verts, neededSize);
glUnmapBuffer(GL_ARRAY_BUFFER);
glDrawArrays(drawMode, curOffset/sizeof(gl3_3D_vtx_t), numVerts);
gl3state.vbo3DcurOffset = curOffset + neededSize; // TODO: padding or sth needed?
#endif
}
}
static void
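
Outside the diff context, the new path boils down to a streaming
pattern of buffer orphaning plus unsynchronized mapping. A minimal
self-contained sketch, with illustrative names (StreamAndDraw,
BIG_VBO_SIZE and bigVBOOffset are not the real yq2 identifiers),
assuming a GL 3.2 core context with functions already loaded and the
VAO/attribute setup matching the vertex data:

    #include <string.h> /* memcpy */

    enum { BIG_VBO_SIZE = 5*1024*1024 };
    static GLintptr bigVBOOffset = 0;

    static void
    StreamAndDraw(GLuint vbo, const void* verts, GLsizeiptr vertSize,
                  GLsizei numVerts, GLenum mode)
    {
        GLsizeiptr needed = vertSize*numVerts;
        glBindBuffer(GL_ARRAY_BUFFER, vbo);

        if(bigVBOOffset + needed > BIG_VBO_SIZE)
        {
            // buffer full: orphan it so the driver hands out fresh storage
            // instead of stalling on data the GPU may still be reading
            glBufferData(GL_ARRAY_BUFFER, BIG_VBO_SIZE, NULL, GL_STREAM_DRAW);
            bigVBOOffset = 0;
        }

        // each draw writes to a region no earlier draw has touched since
        // the last orphaning, so mapping unsynchronized should be safe
        void* dst = glMapBufferRange(GL_ARRAY_BUFFER, bigVBOOffset, needed,
                                     GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT
                                     | GL_MAP_UNSYNCHRONIZED_BIT);
        if(dst != NULL)
        {
            memcpy(dst, verts, needed);
            glUnmapBuffer(GL_ARRAY_BUFFER);
            glDrawArrays(mode, bigVBOOffset/vertSize, numVerts);
            bigVBOOffset += needed;
        }
    }

The per-frame orphaning in GL3_EndFrame() (below) serves the same
purpose at a frame boundary: it guarantees each frame starts writing
into storage the GPU is no longer reading from.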


@@ -98,6 +98,15 @@ DebugCallback(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei le
*/
void GL3_EndFrame(void)
{
if(gl3config.useBigVBO)
{
// I think this is a good point to orphan the VBO and get a fresh one
GL3_BindVAO(gl3state.vao3D);
GL3_BindVBO(gl3state.vbo3D);
glBufferData(GL_ARRAY_BUFFER, gl3state.vbo3Dsize, NULL, GL_STREAM_DRAW);
gl3state.vbo3DcurOffset = 0;
}
SDL_GL_SwapWindow(window);
}


@@ -52,6 +52,13 @@ void GL3_SurfInit(void)
glGenBuffers(1, &gl3state.vbo3D);
GL3_BindVBO(gl3state.vbo3D);
if(gl3config.useBigVBO)
{
gl3state.vbo3Dsize = 5*1024*1024; // a 5MB buffer seems to work well?
gl3state.vbo3DcurOffset = 0;
glBufferData(GL_ARRAY_BUFFER, gl3state.vbo3Dsize, NULL, GL_STREAM_DRAW); // allocate/reserve that data
}
glEnableVertexAttribArray(GL3_ATTRIB_POSITION);
glVertexAttribPointer(GL3_ATTRIB_POSITION, 3, GL_FLOAT, GL_FALSE, sizeof(gl3_3D_vtx_t), 0);


@@ -106,6 +106,8 @@ typedef struct
qboolean debug_output; // is GL_ARB_debug_output supported?
qboolean stencil; // Do we have a stencil buffer?
qboolean useBigVBO; // workaround for AMD's Windows driver: fewer calls to glBufferData()
// ----
float max_anisotropy;
@@ -226,6 +228,11 @@ typedef struct
gl3ShaderInfo_t siParticle; // for particles. surprising, right?
GLuint vao3D, vbo3D; // for brushes etc, using 10 floats and one uint as vertex input (x,y,z, s,t, lms,lmt, normX,normY,normZ ; lightFlags)
// the next two are for gl3config.useBigVBO == true
int vbo3Dsize;
int vbo3DcurOffset;
GLuint vaoAlias, vboAlias, eboAlias; // for models, using 9 floats as (x,y,z, s,t, r,g,b,a)
GLuint vaoParticle, vboParticle; // for particles, using 9 floats (x,y,z, size,distance, r,g,b,a)
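
For reference: the vertex layout described above (10 floats plus one
uint) is 44 bytes per vertex, so the 5MB buffer allocated in
GL3_SurfInit() fits roughly 119,000 vertices before it wraps and gets
orphaned. The actual gl3_3D_vtx_t definition isn't part of this diff;
a sketch of what it presumably looks like, based on the comment above:

    // presumed layout of gl3_3D_vtx_t, reconstructed from the comment
    // above; the real definition lives elsewhere in the renderer sources
    typedef struct {
        GLfloat pos[3];        // x, y, z
        GLfloat texCoord[2];   // s, t
        GLfloat lmTexCoord[2]; // lms, lmt (lightmap coords)
        GLfloat normal[3];     // normX, normY, normZ
        GLuint lightFlags;
    } gl3_3D_vtx_t;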