Change triangle full block to use vectored load and store

This commit is contained in:
Magnus Norddahl 2016-11-20 01:29:19 +01:00
parent 08c4f2ac18
commit 845bf93c95
5 changed files with 152 additions and 104 deletions

View file

@ -303,57 +303,80 @@ void DrawTriangleCodegen::LoopFullBlock()
{
int pixelsize = truecolor ? 4 : 1;
stack_iy.store(SSAInt(0));
stack_buffer.store(dest[x * pixelsize]);
stack_subsectorbuffer.store(subsectorGBuffer[x]);
SSAForBlock loopy;
SSAInt iy = stack_iy.load();
SSAUBytePtr buffer = stack_buffer.load();
SSAIntPtr subsectorbuffer = stack_subsectorbuffer.load();
loopy.loop_block(iy < SSAInt(q), q);
for (int iy = 0; iy < q; iy++)
{
SSAUBytePtr buffer = dest[(x + iy * pitch) * pixelsize];
SSAIntPtr subsectorbuffer = subsectorGBuffer[x + iy * pitch];
SSAInt varying[TriVertex::NumVarying];
SSAInt varyingStep[TriVertex::NumVarying];
for (int i = 0; i < TriVertex::NumVarying; i++)
{
stack_varying[i].store((varyingPos[i] + varyingStepPos[i] * iy) << 8);
varying[i] = (varyingPos[i] + varyingStepPos[i] * iy) << 8;
varyingStep[i] = (varyingStartStepX[i] + varyingIncrStepX[i] * iy) << 8;
}
stack_ix.store(SSAInt(0));
SSAForBlock loopx;
SSAInt ix = stack_ix.load();
SSAInt varying[TriVertex::NumVarying];
for (int i = 0; i < TriVertex::NumVarying; i++)
varying[i] = stack_varying[i].load();
loopx.loop_block(ix < SSAInt(q), q);
for (int ix = 0; ix < q; ix += 4)
{
SSAUBytePtr buf = buffer[ix * pixelsize];
if (truecolor)
{
SSAVec16ub pixels16 = buf.load_unaligned_vec16ub(false);
SSAVec8s pixels8hi = SSAVec8s::extendhi(pixels16);
SSAVec8s pixels8lo = SSAVec8s::extendlo(pixels16);
SSAVec4i pixels[4] =
{
SSAVec4i::extendlo(pixels8lo),
SSAVec4i::extendhi(pixels8lo),
SSAVec4i::extendlo(pixels8hi),
SSAVec4i::extendhi(pixels8hi)
};
for (int sse = 0; sse < 4; sse++)
{
if (variant == TriDrawVariant::DrawSubsector || variant == TriDrawVariant::FillSubsector || variant == TriDrawVariant::FuzzSubsector)
{
SSAIfBlock branch;
branch.if_block(subsectorbuffer[ix].load(true) >= subsectorDepth);
{
ProcessPixel(buffer[ix * pixelsize], subsectorbuffer[ix], varying);
}
branch.end_block();
SSABool subsectorTest = subsectorbuffer[ix].load(true) >= subsectorDepth;
pixels[sse] = subsectorTest.select(ProcessPixel32(pixels[sse], varying), pixels[sse]);
}
else
{
ProcessPixel(buffer[ix * pixelsize], subsectorbuffer[ix], varying);
pixels[sse] = ProcessPixel32(pixels[sse], varying);
}
for (int i = 0; i < TriVertex::NumVarying; i++)
stack_varying[i].store(varying[i] + varyingStep[i]);
stack_ix.store(ix + 1);
varying[i] = varying[i] + varyingStep[i];
}
loopx.end_block();
stack_buffer.store(buffer[pitch * pixelsize]);
stack_subsectorbuffer.store(subsectorbuffer[pitch]);
stack_iy.store(iy + 1);
buf.store_unaligned_vec16ub(SSAVec16ub(SSAVec8s(pixels[0], pixels[1]), SSAVec8s(pixels[2], pixels[3])));
}
else
{
SSAVec4i pixels = buf.load_vec4ub(false);
for (int sse = 0; sse < 4; sse++)
{
if (variant == TriDrawVariant::DrawSubsector || variant == TriDrawVariant::FillSubsector || variant == TriDrawVariant::FuzzSubsector)
{
SSABool subsectorTest = subsectorbuffer[ix].load(true) >= subsectorDepth;
pixels.insert(sse, subsectorTest.select(ProcessPixel8(pixels[sse], varying), pixels[sse]));
}
else
{
pixels.insert(sse, ProcessPixel8(pixels[sse], varying));
}
for (int i = 0; i < TriVertex::NumVarying; i++)
varying[i] = varying[i] + varyingStep[i];
}
buf.store_vec4ub(pixels);
}
if (variant != TriDrawVariant::DrawSubsector && variant != TriDrawVariant::FillSubsector && variant != TriDrawVariant::FuzzSubsector)
subsectorbuffer[ix].store_unaligned_vec4i(SSAVec4i(subsectorDepth));
}
}
loopy.end_block();
}
if (variant != TriDrawVariant::DrawSubsector && variant != TriDrawVariant::FillSubsector && variant != TriDrawVariant::FuzzSubsector)
@ -425,7 +448,21 @@ void DrawTriangleCodegen::LoopPartialBlock()
}
else
{
ProcessPixel(buffer[ix * pixelsize], subsectorbuffer[ix], varying);
SSAUBytePtr buf = buffer[ix * pixelsize];
if (truecolor)
{
SSAVec4i bg = buf.load_vec4ub(false);
buf.store_vec4ub(ProcessPixel32(bg, varying));
}
else
{
SSAUByte bg = buf.load(false);
buf.store(ProcessPixel8(bg.zext_int(), varying).trunc_ubyte());
}
if (variant != TriDrawVariant::DrawSubsector && variant != TriDrawVariant::FillSubsector && variant != TriDrawVariant::FuzzSubsector)
subsectorbuffer[ix].store(subsectorDepth);
}
}
branch.end_block();
@ -466,7 +503,7 @@ SSAVec4i DrawTriangleCodegen::Sample(SSAInt uvoffset)
return texturePixels[uvoffset * 4].load_vec4ub(true);
}
void DrawTriangleCodegen::ProcessPixel(SSAUBytePtr buffer, SSAIntPtr subsectorbuffer, SSAInt *varying)
SSAVec4i DrawTriangleCodegen::ProcessPixel32(SSAVec4i bg, SSAInt *varying)
{
SSAInt ufrac = varying[0];
SSAInt vfrac = varying[1];
@ -475,10 +512,7 @@ void DrawTriangleCodegen::ProcessPixel(SSAUBytePtr buffer, SSAIntPtr subsectorbu
SSAInt vpos = ((vfrac >> 16) * textureHeight) >> 16;
SSAInt uvoffset = upos * textureHeight + vpos;
if (truecolor)
{
SSAVec4i fg;
SSAVec4i bg = buffer.load_vec4ub(false);
SSAInt alpha, inv_alpha;
SSAVec4i output;
@ -532,23 +566,27 @@ void DrawTriangleCodegen::ProcessPixel(SSAUBytePtr buffer, SSAIntPtr subsectorbu
break;
}
buffer.store_vec4ub(output);
}
else
{
return output;
}
// Produces the output value for one pixel on the non-truecolor path
// (callers pass the current destination value as `bg` and write the result back).
// varying[0] and varying[1] are read as the u and v texture coordinate fractions.
// NOTE(review): `bg` is not read anywhere in the body shown here.
SSAInt DrawTriangleCodegen::ProcessPixel8(SSAInt bg, SSAInt *varying)
{
SSAInt ufrac = varying[0];
SSAInt vfrac = varying[1];
// Take the bits above bit 16 of each fraction, scale by the texture dimension,
// then shift right by 16 again to obtain the texel coordinate.
SSAInt upos = ((ufrac >> 16) * textureWidth) >> 16;
SSAInt vpos = ((vfrac >> 16) * textureHeight) >> 16;
// Texel index into texturePixels: upos * textureHeight + vpos.
SSAInt uvoffset = upos * textureHeight + vpos;
if (variant == TriDrawVariant::FillNormal || variant == TriDrawVariant::FillSubsector || variant == TriDrawVariant::FuzzSubsector)
{
// NOTE(review): `buffer` is neither a parameter nor a local of this function as shown;
// this store looks like a removed line from the old ProcessPixel carried over by the diff view — confirm.
buffer.store(color.trunc_ubyte());
// Fill/fuzz variants do not sample the texture; the result is `color`.
return color;
}
else
{
// Load a single byte from the texture at the computed texel index.
SSAUByte fg = texturePixels[uvoffset].load(true);
// NOTE(review): same concern as above — `buffer` is not declared in this function as shown.
buffer.store(fg);
// Widen the loaded byte to an int for the caller.
return fg.zext_int();
}
}
if (variant != TriDrawVariant::DrawSubsector && variant != TriDrawVariant::FillSubsector && variant != TriDrawVariant::FuzzSubsector)
subsectorbuffer.store(subsectorDepth);
}
void DrawTriangleCodegen::SetStencilBlock(SSAInt block)

View file

@ -46,7 +46,8 @@ private:
void LoopFullBlock();
void LoopPartialBlock();
void ProcessPixel(SSAUBytePtr buffer, SSAIntPtr subsectorbuffer, SSAInt *varying);
SSAVec4i ProcessPixel32(SSAVec4i bg, SSAInt *varying);
SSAInt ProcessPixel8(SSAInt bg, SSAInt *varying);
SSAVec4i TranslateSample(SSAInt uvoffset);
SSAVec4i Sample(SSAInt uvoffset);

View file

@ -127,7 +127,7 @@ LLVMDrawers *LLVMDrawers::Instance()
LLVMDrawersImpl::LLVMDrawersImpl()
{
int version = 1; // Increment this number if the drawer codegen is modified (forces recreation of the module).
int version = 2; // Increment this number if the drawer codegen is modified (forces recreation of the module).
std::string targetCPU = mProgram.GetTargetCPU();
bool loaded = mProgram.LoadCachedModule(version, targetCPU);
if (!loaded)

View file

@ -23,6 +23,7 @@
#include "r_compiler/llvm_include.h"
#include "ssa_bool.h"
#include "ssa_ubyte.h"
#include "ssa_vec4i.h"
#include "ssa_value.h"
#include "ssa_scope.h"
@ -61,6 +62,11 @@ SSAUByte SSABool::select(SSAUByte a, SSAUByte b)
return SSAValue::from_llvm(SSAScope::builder().CreateSelect(v, a.v, b.v, SSAScope::hint()));
}
// Emits an LLVM select instruction keyed on this boolean value:
// the result is `a` when the condition is true and `b` otherwise.
// The condition (`v`) is passed as-is, so one boolean chooses between the two vectors.
SSAVec4i SSABool::select(SSAVec4i a, SSAVec4i b)
{
return SSAValue::from_llvm(SSAScope::builder().CreateSelect(v, a.v, b.v, SSAScope::hint()));
}
SSABool operator&&(const SSABool &a, const SSABool &b)
{
return SSABool::from_llvm(SSAScope::builder().CreateAnd(a.v, b.v, SSAScope::hint()));

View file

@ -29,6 +29,8 @@
namespace llvm { class Value; }
namespace llvm { class Type; }
class SSAVec4i;
class SSABool
{
public:
@ -41,6 +43,7 @@ public:
SSAInt zext_int();
SSAInt select(SSAInt a, SSAInt b);
SSAUByte select(SSAUByte a, SSAUByte b);
SSAVec4i select(SSAVec4i a, SSAVec4i b);
llvm::Value *v;
};