Prepared light buffer for multithreaded use

This necessitated removing the reallocator because that cannot be done in a multithreaded context in OpenGL. The buffer should be large enough anyway, it it all gets used, slowdowns would be unavoidable.

There was also some simplification of the buffer alignment math for uniform buffers
This commit is contained in:
Christoph Oelckers 2018-08-14 10:58:58 +02:00
parent 84a55667d9
commit ad80efd6be
2 changed files with 43 additions and 83 deletions

View file

@ -32,13 +32,17 @@
#include "hwrenderer/dynlights/hw_dynlightdata.h" #include "hwrenderer/dynlights/hw_dynlightdata.h"
#include "hwrenderer/data/shaderuniforms.h" #include "hwrenderer/data/shaderuniforms.h"
static const int INITIAL_BUFFER_SIZE = 160000; // This means 80000 lights per frame and 160000*16 bytes == 2.56 MB. static const int ELEMENTS_PER_LIGHT = 4; // each light needs 4 vec4's.
static const int ELEMENT_SIZE = (4*sizeof(float));
FLightBuffer::FLightBuffer() FLightBuffer::FLightBuffer()
{ {
int maxNumberOfLights = gl.lightmethod == LM_DIRECT? 80000 : 40000;
mBufferSize = maxNumberOfLights * ELEMENTS_PER_LIGHT;
mByteSize = mBufferSize * ELEMENT_SIZE;
mBufferSize = INITIAL_BUFFER_SIZE;
mByteSize = mBufferSize * sizeof(float);
// Hack alert: On Intel's GL driver SSBO's perform quite worse than UBOs. // Hack alert: On Intel's GL driver SSBO's perform quite worse than UBOs.
// We only want to disable using SSBOs for lights but not disable the feature entirely. // We only want to disable using SSBOs for lights but not disable the feature entirely.
// Note that using an uniform buffer here will limit the number of lights per surface so it isn't done for NVidia and AMD. // Note that using an uniform buffer here will limit the number of lights per surface so it isn't done for NVidia and AMD.
@ -46,15 +50,16 @@ FLightBuffer::FLightBuffer()
{ {
mBufferType = GL_SHADER_STORAGE_BUFFER; mBufferType = GL_SHADER_STORAGE_BUFFER;
mBlockAlign = 0; mBlockAlign = 0;
mBlockSize = mBufferSize; mBlockSize = mBufferSize / ELEMENT_SIZE;
mMaxUploadSize = mBlockSize;
} }
else else
{ {
mBufferType = GL_UNIFORM_BUFFER; mBufferType = GL_UNIFORM_BUFFER;
mBlockSize = gl.maxuniformblock / 16; mBlockSize = gl.maxuniformblock / ELEMENT_SIZE;
if (mBlockSize > 2048) mBlockSize = 2048; // we don't really need a larger buffer mBlockAlign = gl.uniformblockalignment / ELEMENT_SIZE;
mMaxUploadSize = (mBlockSize - mBlockAlign);
mBlockAlign = mBlockSize / 2; mByteSize += gl.maxuniformblock; // to avoid mapping beyond the end of the buffer.
} }
glGenBuffers(1, &mBufferId); glGenBuffers(1, &mBufferId);
@ -84,25 +89,19 @@ FLightBuffer::~FLightBuffer()
void FLightBuffer::Clear() void FLightBuffer::Clear()
{ {
mIndex = 0; mIndex = 0;
mUploadIndex = 0;
} }
int FLightBuffer::UploadLights(FDynLightData &data) int FLightBuffer::UploadLights(FDynLightData &data)
{ {
// All meaasurements here are in vec4's.
int size0 = data.arrays[0].Size()/4; int size0 = data.arrays[0].Size()/4;
int size1 = data.arrays[1].Size()/4; int size1 = data.arrays[1].Size()/4;
int size2 = data.arrays[2].Size()/4; int size2 = data.arrays[2].Size()/4;
int totalsize = size0 + size1 + size2 + 1; int totalsize = size0 + size1 + size2 + 1;
// pointless type casting because some compilers can't print enough warnings. if (totalsize > (int)mMaxUploadSize)
if (mBlockAlign > 0 && (unsigned int)totalsize + (mIndex % mBlockAlign) > mBlockSize)
{ {
mIndex = ((mIndex + mBlockAlign) / mBlockAlign) * mBlockAlign; int diff = totalsize - (int)mMaxUploadSize;
// can't be rendered all at once.
if ((unsigned int)totalsize > mBlockSize)
{
int diff = totalsize - (int)mBlockSize;
size2 -= diff; size2 -= diff;
if (size2 < 0) if (size2 < 0)
@ -117,63 +116,23 @@ int FLightBuffer::UploadLights(FDynLightData &data)
} }
totalsize = size0 + size1 + size2 + 1; totalsize = size0 + size1 + size2 + 1;
} }
}
if (totalsize <= 1) return -1; assert(mBufferPointer != nullptr);
if (mBufferPointer == nullptr) return -1;
if (mIndex + totalsize > mBufferSize/4) if (totalsize <= 4 || mIndex + totalsize > mBufferSize) return -1;
{ auto thisindex = mIndex.fetch_add(totalsize);
// reallocate the buffer with twice the size if (thisindex + totalsize > mBufferSize) return -1; // must retest because another thread might have changed mIndex.
unsigned int newbuffer;
// first unmap the old buffer float *copyptr = mBufferPointer + thisindex*4;
glBindBuffer(mBufferType, mBufferId);
glUnmapBuffer(mBufferType);
// create and bind the new buffer, bind the old one to a copy target (too bad that DSA is not yet supported well enough to omit this crap.)
glGenBuffers(1, &newbuffer);
glBindBufferBase(mBufferType, LIGHTBUF_BINDINGPOINT, newbuffer);
glBindBuffer(mBufferType, newbuffer); // Note: Some older AMD drivers don't do that in glBindBufferBase, as they should.
glBindBuffer(GL_COPY_READ_BUFFER, mBufferId);
// create the new buffer's storage (twice as large as the old one)
mBufferSize *= 2;
mByteSize *= 2;
if (gl.lightmethod == LM_DIRECT)
{
glBufferStorage(mBufferType, mByteSize, NULL, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
mBufferPointer = (float*)glMapBufferRange(mBufferType, 0, mByteSize, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
}
else
{
glBufferData(mBufferType, mByteSize, NULL, GL_DYNAMIC_DRAW);
mBufferPointer = (float*)glMapBufferRange(mBufferType, 0, mByteSize, GL_MAP_WRITE_BIT|GL_MAP_INVALIDATE_BUFFER_BIT);
}
// copy contents and delete the old buffer.
glCopyBufferSubData(GL_COPY_READ_BUFFER, mBufferType, 0, 0, mByteSize/2);
glBindBuffer(GL_COPY_READ_BUFFER, 0);
glDeleteBuffers(1, &mBufferId);
mBufferId = newbuffer;
}
float *copyptr;
assert(mBufferPointer != NULL);
if (mBufferPointer == NULL) return -1;
copyptr = mBufferPointer + mIndex * 4;
float parmcnt[] = { 0, float(size0), float(size0 + size1), float(size0 + size1 + size2) }; float parmcnt[] = { 0, float(size0), float(size0 + size1), float(size0 + size1 + size2) };
memcpy(&copyptr[0], parmcnt, ELEMENT_SIZE);
memcpy(&copyptr[4], &data.arrays[0][0], size0 * ELEMENT_SIZE);
memcpy(&copyptr[4 + 4*size0], &data.arrays[1][0], size1 * ELEMENT_SIZE);
memcpy(&copyptr[4 + 4*(size0 + size1)], &data.arrays[2][0], size2 * ELEMENT_SIZE);
memcpy(&copyptr[0], parmcnt, 4 * sizeof(float)); return thisindex;
memcpy(&copyptr[4], &data.arrays[0][0], 4 * size0*sizeof(float));
memcpy(&copyptr[4 + 4*size0], &data.arrays[1][0], 4 * size1*sizeof(float));
memcpy(&copyptr[4 + 4*(size0 + size1)], &data.arrays[2][0], 4 * size2*sizeof(float));
unsigned int bufferindex = mIndex;
mIndex += totalsize;
draw_dlight += (totalsize-1) / 2;
return bufferindex;
} }
void FLightBuffer::Begin() void FLightBuffer::Begin()
@ -203,7 +162,7 @@ int FLightBuffer::BindUBO(unsigned int index)
{ {
// this will only get called if a uniform buffer is used. For a shader storage buffer we only need to bind the buffer once at the start to all shader programs // this will only get called if a uniform buffer is used. For a shader storage buffer we only need to bind the buffer once at the start to all shader programs
mLastMappedIndex = offset; mLastMappedIndex = offset;
glBindBufferRange(GL_UNIFORM_BUFFER, LIGHTBUF_BINDINGPOINT, mBufferId, offset*16, mBlockSize*16); // we go from counting vec4's to counting bytes here. glBindBufferRange(GL_UNIFORM_BUFFER, LIGHTBUF_BINDINGPOINT, mBufferId, offset * ELEMENT_SIZE, mBlockSize * ELEMENT_SIZE);
} }
return (index - offset); return (index - offset);
} }

View file

@ -3,6 +3,7 @@
#include "tarray.h" #include "tarray.h"
#include "hwrenderer/dynlights/hw_dynlightdata.h" #include "hwrenderer/dynlights/hw_dynlightdata.h"
#include <atomic>
class FLightBuffer class FLightBuffer
{ {
@ -10,13 +11,13 @@ class FLightBuffer
float * mBufferPointer; float * mBufferPointer;
unsigned int mBufferType; unsigned int mBufferType;
unsigned int mIndex; std::atomic<unsigned int> mIndex;
unsigned int mUploadIndex;
unsigned int mLastMappedIndex; unsigned int mLastMappedIndex;
unsigned int mBlockAlign; unsigned int mBlockAlign;
unsigned int mBlockSize; unsigned int mBlockSize;
unsigned int mBufferSize; unsigned int mBufferSize;
unsigned int mByteSize; unsigned int mByteSize;
unsigned int mMaxUploadSize;
public: public: