Masked store experiment (which turned out to be much slower)

This commit is contained in:
Magnus Norddahl 2016-12-01 10:52:53 +01:00
parent c89e023bd5
commit e3b3b7a4ce
7 changed files with 206 additions and 4 deletions

View file

@ -541,6 +541,131 @@ void DrawTriangleCodegen::LoopPartialBlock()
loopy.end_block();
}
#if 0
// Experimental variant of the block loop: walks a q x q block four pixels at a
// time, builds a per-pixel coverage flag and writes the results with masked
// stores instead of branching on each pixel. Nothing is emitted for the Stencil
// and StencilClose variants.
void DrawTriangleCodegen::LoopMaskedStoreBlock()
{
	// The stencil-only variants have no masked-store implementation.
	if (variant == TriDrawVariant::Stencil || variant == TriDrawVariant::StencilClose)
		return;

	// The subsector variants test against the subsector buffer instead of writing to it.
	const bool subsectorVariant =
		variant == TriDrawVariant::DrawSubsector ||
		variant == TriDrawVariant::FillSubsector ||
		variant == TriDrawVariant::FuzzSubsector;

	int pixelsize = truecolor ? 4 : 1;

	AffineW = posx_w;
	for (int i = 0; i < TriVertex::NumVarying; i++)
		AffineVaryingPosY[i] = posx_varying[i];

	// Per-row starting values of the three edge terms (presumably the half-space
	// edge functions - confirm against the triangle setup code). They are stepped
	// by FDX* once per row and by FDY* once per pixel below.
	SSAInt CY1 = C1 + DX12 * y0 - DY12 * x0;
	SSAInt CY2 = C2 + DX23 * y0 - DY23 * x0;
	SSAInt CY3 = C3 + DX31 * y0 - DY31 * x0;

	for (int iy = 0; iy < q; iy++)
	{
		SSAUBytePtr buffer = dest[(x + iy * pitch) * pixelsize];
		SSAIntPtr subsectorbuffer = subsectorGBuffer[x + iy * pitch];

		SetupAffineBlock();

		SSAInt CX1 = CY1;
		SSAInt CX2 = CY2;
		SSAInt CX3 = CY3;

		for (int ix = 0; ix < q; ix += 4)
		{
			// One coverage flag per pixel of the current group of four.
			SSABool covered[4];
			for (int maskindex = 0; maskindex < 4; maskindex++)
			{
				// A pixel is a candidate only if all three edge terms are positive.
				covered[maskindex] = CX1 > SSAInt(0) && CX2 > SSAInt(0) && CX3 > SSAInt(0);

				auto xx = SSAInt(ix + maskindex);
				auto yy = SSAInt(iy);

				// StencilClose returned above, so only two coverage tests remain here.
				if (subsectorVariant)
				{
					covered[maskindex] = covered[maskindex] && SSABool::compare_uge(StencilGet(xx, yy), stencilTestValue) && subsectorbuffer[ix + maskindex].load(true) >= subsectorDepth;
				}
				else
				{
					covered[maskindex] = covered[maskindex] && StencilGet(xx, yy) == stencilTestValue;
				}

				CX1 = CX1 - FDY12;
				CX2 = CX2 - FDY23;
				CX3 = CX3 - FDY31;
			}

			SSAUBytePtr buf = buffer[ix * pixelsize];
			if (truecolor)
			{
				// Load four 32-bit pixels at once and widen each one to an SSAVec4i.
				SSAVec16ub pixels16 = buf.load_unaligned_vec16ub(false);
				SSAVec8s pixels8hi = SSAVec8s::extendhi(pixels16);
				SSAVec8s pixels8lo = SSAVec8s::extendlo(pixels16);
				SSAVec4i pixels[4] =
				{
					SSAVec4i::extendlo(pixels8lo),
					SSAVec4i::extendhi(pixels8lo),
					SSAVec4i::extendlo(pixels8hi),
					SSAVec4i::extendhi(pixels8hi)
				};

				for (int sse = 0; sse < 4; sse++)
				{
					pixels[sse] = ProcessPixel32(pixels[sse], AffineVaryingPosX);

					for (int i = 0; i < TriVertex::NumVarying; i++)
						AffineVaryingPosX[i] = AffineVaryingPosX[i] + AffineVaryingStepX[i];
				}

				// Repack the four pixels and write only the covered ones.
				buf.store_masked_vec16ub(SSAVec16ub(SSAVec8s(pixels[0], pixels[1]), SSAVec8s(pixels[2], pixels[3])), covered);
			}
			else
			{
				// Palette mode: four one-byte pixels per group.
				SSAVec4i pixelsvec = buf.load_vec4ub(false);
				SSAInt pixels[4] =
				{
					pixelsvec[0],
					pixelsvec[1],
					pixelsvec[2],
					pixelsvec[3]
				};

				for (int sse = 0; sse < 4; sse++)
				{
					pixels[sse] = ProcessPixel8(pixels[sse], AffineVaryingPosX);

					for (int i = 0; i < TriVertex::NumVarying; i++)
						AffineVaryingPosX[i] = AffineVaryingPosX[i] + AffineVaryingStepX[i];
				}

				buf.store_masked_vec4ub(SSAVec4i(pixels[0], pixels[1], pixels[2], pixels[3]), covered);
			}

			// Only the non-subsector variants record their depth in the subsector buffer.
			if (!subsectorVariant)
				subsectorbuffer[ix].store_masked_vec4i(SSAVec4i(subsectorDepth), covered);
		}

		// Advance the interpolants and edge terms to the next row.
		AffineW = AffineW + gradWY;
		for (int i = 0; i < TriVertex::NumVarying; i++)
			AffineVaryingPosY[i] = AffineVaryingPosY[i] + gradVaryingY[i];

		CY1 = CY1 + FDX12;
		CY2 = CY2 + FDX23;
		CY3 = CY3 + FDX31;
	}
}
#endif
SSAVec4i DrawTriangleCodegen::TranslateSample32(SSAInt uvoffset)
{
if (variant == TriDrawVariant::FillNormal || variant == TriDrawVariant::FillSubsector || variant == TriDrawVariant::FuzzSubsector)

View file

@ -86,6 +86,6 @@ void SSAFloatPtr::store_vec4f(const SSAVec4f &new_value)
void SSAFloatPtr::store_unaligned_vec4f(const SSAVec4f &new_value)
{
llvm::PointerType *m4xfloattypeptr = llvm::VectorType::get(llvm::Type::getFloatTy(SSAScope::context()), 4)->getPointerTo();
auto inst = SSAScope::builder().CreateAlignedStore(new_value.v, SSAScope::builder().CreateBitCast(v, m4xfloattypeptr, SSAScope::hint()), 1);
auto inst = SSAScope::builder().CreateAlignedStore(new_value.v, SSAScope::builder().CreateBitCast(v, m4xfloattypeptr, SSAScope::hint()), 4);
inst->setMetadata(llvm::LLVMContext::MD_noalias, SSAScope::constant_scope_list());
}

View file

@ -23,6 +23,7 @@
#include "precomp.h"
#include "ssa_int_ptr.h"
#include "ssa_scope.h"
#include "ssa_bool.h"
SSAIntPtr::SSAIntPtr()
: v(0)
@ -86,6 +87,25 @@ void SSAIntPtr::store_vec4i(const SSAVec4i &new_value)
void SSAIntPtr::store_unaligned_vec4i(const SSAVec4i &new_value)
{
llvm::PointerType *m4xint32typeptr = llvm::VectorType::get(llvm::Type::getInt32Ty(SSAScope::context()), 4)->getPointerTo();
auto inst = SSAScope::builder().CreateAlignedStore(new_value.v, SSAScope::builder().CreateBitCast(v, m4xint32typeptr, SSAScope::hint()), 1);
auto inst = SSAScope::builder().CreateAlignedStore(new_value.v, SSAScope::builder().CreateBitCast(v, m4xint32typeptr, SSAScope::hint()), 4);
inst->setMetadata(llvm::LLVMContext::MD_noalias, SSAScope::constant_scope_list());
}
void SSAIntPtr::store_masked_vec4i(const SSAVec4i &new_value, SSABool mask[4])
{
// Create mask vector
std::vector<llvm::Constant*> maskconstants;
maskconstants.resize(4, llvm::ConstantInt::get(SSAScope::context(), llvm::APInt(1, 0, false)));
llvm::Value *maskValue = llvm::ConstantVector::get(maskconstants);
#if LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9)
for (int i = 0; i < 4; i++)
maskValue = SSAScope::builder().CreateInsertElement(maskValue, mask[i].v, SSAInt(i).v, SSAScope::hint());
#else
for (int i = 0; i < 4; i++)
maskValue = SSAScope::builder().CreateInsertElement(maskValue, mask[i].v, (uint64_t)i, SSAScope::hint());
#endif
llvm::PointerType *m4xint32typeptr = llvm::VectorType::get(llvm::Type::getInt32Ty(SSAScope::context()), 4)->getPointerTo();
auto inst = SSAScope::builder().CreateMaskedStore(new_value.v, SSAScope::builder().CreateBitCast(v, m4xint32typeptr, SSAScope::hint()), 1, maskValue);
inst->setMetadata(llvm::LLVMContext::MD_noalias, SSAScope::constant_scope_list());
}

View file

@ -29,6 +29,8 @@
namespace llvm { class Value; }
namespace llvm { class Type; }
class SSABool;
class SSAIntPtr
{
public:
@ -44,6 +46,7 @@ public:
void store(const SSAInt &new_value);
void store_vec4i(const SSAVec4i &new_value);
void store_unaligned_vec4i(const SSAVec4i &new_value);
void store_masked_vec4i(const SSAVec4i &new_value, SSABool mask[4]);
llvm::Value *v;
};

View file

@ -23,6 +23,7 @@
#include "precomp.h"
#include "ssa_ubyte_ptr.h"
#include "ssa_scope.h"
#include "ssa_bool.h"
SSAUBytePtr::SSAUBytePtr()
: v(0)
@ -104,6 +105,37 @@ void SSAUBytePtr::store_vec4ub(const SSAVec4i &new_value)
inst->setMetadata(llvm::LLVMContext::MD_noalias, SSAScope::constant_scope_list());
}
void SSAUBytePtr::store_masked_vec4ub(const SSAVec4i &new_value, SSABool mask[4])
{
// Store using saturate:
SSAVec8s v8s(new_value, new_value);
SSAVec16ub v16ub(v8s, v8s);
// Create mask vector
std::vector<llvm::Constant*> maskconstants;
maskconstants.resize(4, llvm::ConstantInt::get(SSAScope::context(), llvm::APInt(1, 0, false)));
llvm::Value *maskValue = llvm::ConstantVector::get(maskconstants);
#if LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9)
for (int i = 0; i < 4; i++)
maskValue = SSAScope::builder().CreateInsertElement(maskValue, mask[i].v, SSAInt(i).v, SSAScope::hint());
#else
for (int i = 0; i < 4; i++)
maskValue = SSAScope::builder().CreateInsertElement(maskValue, mask[i].v, (uint64_t)i, SSAScope::hint());
#endif
llvm::Type *m16xint8type = llvm::VectorType::get(llvm::Type::getInt8Ty(SSAScope::context()), 16);
llvm::PointerType *m4xint8typeptr = llvm::VectorType::get(llvm::Type::getInt8Ty(SSAScope::context()), 4)->getPointerTo();
std::vector<llvm::Constant*> constants;
constants.push_back(llvm::ConstantInt::get(SSAScope::context(), llvm::APInt(32, 0)));
constants.push_back(llvm::ConstantInt::get(SSAScope::context(), llvm::APInt(32, 1)));
constants.push_back(llvm::ConstantInt::get(SSAScope::context(), llvm::APInt(32, 2)));
constants.push_back(llvm::ConstantInt::get(SSAScope::context(), llvm::APInt(32, 3)));
llvm::Value *shufflemask = llvm::ConstantVector::get(constants);
llvm::Value *val_vector = SSAScope::builder().CreateShuffleVector(v16ub.v, llvm::UndefValue::get(m16xint8type), shufflemask, SSAScope::hint());
llvm::CallInst *inst = SSAScope::builder().CreateMaskedStore(val_vector, SSAScope::builder().CreateBitCast(v, m4xint8typeptr, SSAScope::hint()), 1, maskValue);
inst->setMetadata(llvm::LLVMContext::MD_noalias, SSAScope::constant_scope_list());
}
void SSAUBytePtr::store_vec16ub(const SSAVec16ub &new_value)
{
llvm::PointerType *m16xint8typeptr = llvm::VectorType::get(llvm::Type::getInt8Ty(SSAScope::context()), 16)->getPointerTo();
@ -118,6 +150,24 @@ void SSAUBytePtr::store_vec16ub(const SSAVec16ub &new_value)
void SSAUBytePtr::store_unaligned_vec16ub(const SSAVec16ub &new_value)
{
llvm::PointerType *m16xint8typeptr = llvm::VectorType::get(llvm::Type::getInt8Ty(SSAScope::context()), 16)->getPointerTo();
llvm::StoreInst *inst = SSAScope::builder().CreateAlignedStore(new_value.v, SSAScope::builder().CreateBitCast(v, m16xint8typeptr, SSAScope::hint()), 1);
llvm::StoreInst *inst = SSAScope::builder().CreateAlignedStore(new_value.v, SSAScope::builder().CreateBitCast(v, m16xint8typeptr, SSAScope::hint()), 4);
inst->setMetadata(llvm::LLVMContext::MD_noalias, SSAScope::constant_scope_list());
}
void SSAUBytePtr::store_masked_vec16ub(const SSAVec16ub &new_value, SSABool mask[4])
{
std::vector<llvm::Constant*> constants;
constants.resize(16, llvm::ConstantInt::get(SSAScope::context(), llvm::APInt(1, 0, false)));
llvm::Value *maskValue = llvm::ConstantVector::get(constants);
#if LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9)
for (int i = 0; i < 16; i++)
maskValue = SSAScope::builder().CreateInsertElement(maskValue, mask[i / 4].v, SSAInt(i).v, SSAScope::hint());
#else
for (int i = 0; i < 16; i++)
maskValue = SSAScope::builder().CreateInsertElement(maskValue, mask[i / 4].v, (uint64_t)i, SSAScope::hint());
#endif
llvm::PointerType *m16xint8typeptr = llvm::VectorType::get(llvm::Type::getInt8Ty(SSAScope::context()), 16)->getPointerTo();
llvm::CallInst *inst = SSAScope::builder().CreateMaskedStore(new_value.v, SSAScope::builder().CreateBitCast(v, m16xint8typeptr, SSAScope::hint()), 1, maskValue);
inst->setMetadata(llvm::LLVMContext::MD_noalias, SSAScope::constant_scope_list());
}

View file

@ -31,6 +31,8 @@
namespace llvm { class Value; }
namespace llvm { class Type; }
class SSABool;
class SSAUBytePtr
{
public:
@ -46,8 +48,10 @@ public:
SSAVec16ub load_unaligned_vec16ub(bool constantScopeDomain) const;
void store(const SSAUByte &new_value);
void store_vec4ub(const SSAVec4i &new_value);
void store_masked_vec4ub(const SSAVec4i &new_value, SSABool mask[4]);
void store_vec16ub(const SSAVec16ub &new_value);
void store_unaligned_vec16ub(const SSAVec16ub &new_value);
void store_masked_vec16ub(const SSAVec16ub &new_value, SSABool mask[4]);
llvm::Value *v;
};

View file

@ -68,6 +68,6 @@ void SSAVec4fPtr::store(const SSAVec4f &new_value)
void SSAVec4fPtr::store_unaligned(const SSAVec4f &new_value)
{
auto inst = SSAScope::builder().CreateAlignedStore(new_value.v, v, 1, false);
auto inst = SSAScope::builder().CreateAlignedStore(new_value.v, v, 4, false);
inst->setMetadata(llvm::LLVMContext::MD_noalias, SSAScope::constant_scope_list());
}