mirror of
https://github.com/yquake2/yquake2remaster.git
synced 2024-11-22 20:51:31 +00:00
Try to make GL3_BufferAndDraw3D() faster on AMD/Windows
It seems that AMD's Windows driver doesn't like it when we call glBufferData() *a lot* (other drivers, including Intel's, don't seem to care as much). Even on an i7-4771 with a Radeon RX 580 I couldn't get a stable 60 fps on Windows without this workaround (the open-source Linux driver is OK). The workaround can be enabled or disabled with the gl3_usebigvbo cvar; its default is -1, which means "enable if an AMD driver is detected". Enabling it on an NVIDIA GPU with the proprietary driver reduces performance to 1/3 of the fps we get without it, so it really does need to be conditional.
This commit is contained in:
parent
7b4dc000ad
commit
26a461575b
4 changed files with 123 additions and 4 deletions
|
@ -121,6 +121,7 @@ cvar_t *r_modulate;
|
|||
cvar_t *gl_lightmap;
|
||||
cvar_t *gl_shadows;
|
||||
cvar_t *gl3_debugcontext;
|
||||
cvar_t *gl3_usebigvbo;
|
||||
|
||||
// Yaw-Pitch-Roll
|
||||
// equivalent to R_z * R_y * R_x where R_x is the trans matrix for rotating around X axis for aroundXdeg
|
||||
|
@ -206,6 +207,11 @@ GL3_Register(void)
|
|||
gl3_particle_fade_factor = ri.Cvar_Get("gl3_particle_fade_factor", "1.2", CVAR_ARCHIVE);
|
||||
gl3_particle_square = ri.Cvar_Get("gl3_particle_square", "0", CVAR_ARCHIVE);
|
||||
|
||||
// 0: use lots of calls to glBufferData()
|
||||
// 1: reduce calls to glBufferData() with one big VBO (see GL3_BufferAndDraw3D())
|
||||
// -1: auto (let yq2 choose to enable/disable this based on detected driver)
|
||||
gl3_usebigvbo = ri.Cvar_Get("gl3_usebigvbo", "-1", CVAR_ARCHIVE);
|
||||
|
||||
r_norefresh = ri.Cvar_Get("r_norefresh", "0", 0);
|
||||
r_drawentities = ri.Cvar_Get("r_drawentities", "1", 0);
|
||||
r_drawworld = ri.Cvar_Get("r_drawworld", "1", 0);
|
||||
|
@ -530,6 +536,34 @@ GL3_Init(void)
|
|||
R_Printf(PRINT_ALL, " - OpenGL Debug Output: Not Supported\n");
|
||||
}
|
||||
|
||||
gl3config.useBigVBO = false;
|
||||
if(gl3_usebigvbo->value == 1.0f)
|
||||
{
|
||||
R_Printf(PRINT_ALL, "Enabling useBigVBO workaround because gl3_usebigvbo = 1\n");
|
||||
gl3config.useBigVBO = true;
|
||||
}
|
||||
else if(gl3_usebigvbo->value == -1.0f)
|
||||
{
|
||||
// enable for AMD's proprietary Windows and Linux drivers
|
||||
// TODO: should we match a version number? does the workaround
|
||||
// slow down legacy drivers that work fine without it?
|
||||
// This workaround has been tested with the following configuration:
|
||||
// RX580, Win10, driver version "Adrenalin 2019 Edition 19.4.x"
|
||||
#ifdef _WIN32
|
||||
if(gl3config.vendor_string != NULL && strstr(gl3config.vendor_string, "ATI") != NULL)
|
||||
{
|
||||
R_Printf(PRINT_ALL, "Detected AMD Windows GPU driver, enabling useBigVBO workaround\n");
|
||||
gl3config.useBigVBO = true;
|
||||
}
|
||||
#elif defined(__linux__)
|
||||
if(gl3config.vendor_string != NULL && strstr(gl3config.vendor_string, "Advanced Micro Devices, Inc.") != NULL)
|
||||
{
|
||||
R_Printf(PRINT_ALL, "Detected proprietary AMD GPU driver, enabling useBigVBO workaround\n");
|
||||
gl3config.useBigVBO = true;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// generate texture handles for all possible lightmaps
|
||||
glGenTextures(MAX_LIGHTMAPS*MAX_LIGHTMAPS_PER_SURFACE, gl3state.lightmap_textureIDs[0]);
|
||||
|
||||
|
@ -589,10 +623,72 @@ GL3_Shutdown(void)
|
|||
void
|
||||
GL3_BufferAndDraw3D(const gl3_3D_vtx_t* verts, int numVerts, GLenum drawMode)
|
||||
{
|
||||
// TODO: do something more efficient, maybe with glMapBufferRange() + GL_MAP_UNSYNCHRONIZED_BIT
|
||||
// and glBindBufferRange()
|
||||
glBufferData( GL_ARRAY_BUFFER, sizeof(gl3_3D_vtx_t)*numVerts, verts, GL_STREAM_DRAW );
|
||||
glDrawArrays( drawMode, 0, numVerts );
|
||||
if(!gl3config.useBigVBO)
|
||||
{
|
||||
glBufferData( GL_ARRAY_BUFFER, sizeof(gl3_3D_vtx_t)*numVerts, verts, GL_STREAM_DRAW );
|
||||
glDrawArrays( drawMode, 0, numVerts );
|
||||
}
|
||||
else // gl3config.useBigVBO == true
|
||||
{
|
||||
/*
|
||||
* For some reason, AMD's Windows driver doesn't seem to like lots of
|
||||
* calls to glBufferData() (some of them seem to take very long then).
|
||||
* GL3_BufferAndDraw3D() is called a lot when drawing world geometry
|
||||
* (once for each visible face I think?).
|
||||
* The simple code above caused noticeable slowdowns - even a fast
|
||||
* quadcore CPU and a Radeon RX580 weren't able to maintain 60fps..
|
||||
* The workaround is to not call glBufferData() with small data all the time,
|
||||
* but to allocate a big buffer and on each call to GL3_BufferAndDraw3D()
|
||||
* to use a different region of that buffer, resulting in far fewer calls
|
||||
* to glBufferData() (=> far fewer buffer allocations in the driver).
|
||||
* Only when the buffer is full or at the end of a frame (=> GL3_EndFrame())
|
||||
* we get a fresh buffer.
|
||||
*
|
||||
* BTW, we couldn't observe this kind of problem with any other driver:
|
||||
* Neither NVIDIA's driver, nor AMD's or Intel's open-source Linux drivers,
|
||||
* nor even Intel's Windows driver seem to care that much about the
|
||||
* glBufferData() calls. However, at least NVIDIA's driver doesn't like
|
||||
* this workaround (with glMapBufferRange()), the framerate dropped
|
||||
* significantly - that's why both methods are available and
|
||||
* selectable at runtime.
|
||||
*/
|
||||
#if 0
|
||||
// I /think/ doing it with glBufferSubData() didn't really help
|
||||
const int bufSize = gl3state.vbo3Dsize;
|
||||
int neededSize = numVerts*sizeof(gl3_3D_vtx_t);
|
||||
int curOffset = gl3state.vbo3DcurOffset;
|
||||
if(curOffset + neededSize > gl3state.vbo3Dsize)
|
||||
curOffset = 0;
|
||||
int curIdx = curOffset / sizeof(gl3_3D_vtx_t);
|
||||
|
||||
gl3state.vbo3DcurOffset = curOffset + neededSize;
|
||||
|
||||
glBufferSubData( GL_ARRAY_BUFFER, curOffset, neededSize, verts );
|
||||
glDrawArrays( drawMode, curIdx, numVerts );
|
||||
#else
|
||||
int curOffset = gl3state.vbo3DcurOffset;
|
||||
int neededSize = numVerts*sizeof(gl3_3D_vtx_t);
|
||||
if(curOffset+neededSize > gl3state.vbo3Dsize)
|
||||
{
|
||||
// buffer is full, need to start again from the beginning
|
||||
// => need to sync or get fresh buffer
|
||||
// (getting fresh buffer seems easier)
|
||||
glBufferData(GL_ARRAY_BUFFER, gl3state.vbo3Dsize, NULL, GL_STREAM_DRAW);
|
||||
curOffset = 0;
|
||||
}
|
||||
|
||||
// as we make sure to use a previously unused part of the buffer,
|
||||
// doing it unsynchronized should be safe..
|
||||
GLbitfield accessBits = GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_UNSYNCHRONIZED_BIT;
|
||||
void* data = glMapBufferRange(GL_ARRAY_BUFFER, curOffset, neededSize, accessBits);
|
||||
memcpy(data, verts, neededSize);
|
||||
glUnmapBuffer(GL_ARRAY_BUFFER);
|
||||
|
||||
glDrawArrays(drawMode, curOffset/sizeof(gl3_3D_vtx_t), numVerts);
|
||||
|
||||
gl3state.vbo3DcurOffset = curOffset + neededSize; // TODO: padding or sth needed?
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
|
@ -98,6 +98,15 @@ DebugCallback(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei le
|
|||
*/
|
||||
void GL3_EndFrame(void)
|
||||
{
|
||||
if(gl3config.useBigVBO)
|
||||
{
|
||||
// I think this is a good point to orphan the VBO and get a fresh one
|
||||
GL3_BindVAO(gl3state.vao3D);
|
||||
GL3_BindVBO(gl3state.vbo3D);
|
||||
glBufferData(GL_ARRAY_BUFFER, gl3state.vbo3Dsize, NULL, GL_STREAM_DRAW);
|
||||
gl3state.vbo3DcurOffset = 0;
|
||||
}
|
||||
|
||||
SDL_GL_SwapWindow(window);
|
||||
}
|
||||
|
||||
|
|
|
@ -52,6 +52,13 @@ void GL3_SurfInit(void)
|
|||
glGenBuffers(1, &gl3state.vbo3D);
|
||||
GL3_BindVBO(gl3state.vbo3D);
|
||||
|
||||
if(gl3config.useBigVBO)
|
||||
{
|
||||
gl3state.vbo3Dsize = 5*1024*1024; // a 5MB buffer seems to work well?
|
||||
gl3state.vbo3DcurOffset = 0;
|
||||
glBufferData(GL_ARRAY_BUFFER, gl3state.vbo3Dsize, NULL, GL_STREAM_DRAW); // allocate/reserve that data
|
||||
}
|
||||
|
||||
glEnableVertexAttribArray(GL3_ATTRIB_POSITION);
|
||||
qglVertexAttribPointer(GL3_ATTRIB_POSITION, 3, GL_FLOAT, GL_FALSE, sizeof(gl3_3D_vtx_t), 0);
|
||||
|
||||
|
|
|
@ -106,6 +106,8 @@ typedef struct
|
|||
qboolean debug_output; // is GL_ARB_debug_output supported?
|
||||
qboolean stencil; // Do we have a stencil buffer?
|
||||
|
||||
qboolean useBigVBO; // workaround for AMDs windows driver for fewer calls to glBufferData()
|
||||
|
||||
// ----
|
||||
|
||||
float max_anisotropy;
|
||||
|
@ -226,6 +228,11 @@ typedef struct
|
|||
gl3ShaderInfo_t siParticle; // for particles. surprising, right?
|
||||
|
||||
GLuint vao3D, vbo3D; // for brushes etc, using 10 floats and one uint as vertex input (x,y,z, s,t, lms,lmt, normX,normY,normZ ; lightFlags)
|
||||
|
||||
// the next two are for gl3config.useBigVBO == true
|
||||
int vbo3Dsize;
|
||||
int vbo3DcurOffset;
|
||||
|
||||
GLuint vaoAlias, vboAlias, eboAlias; // for models, using 9 floats as (x,y,z, s,t, r,g,b,a)
|
||||
GLuint vaoParticle, vboParticle; // for particles, using 9 floats (x,y,z, size,distance, r,g,b,a)
|
||||
|
||||
|
|
Loading…
Reference in a new issue