From e3b3b7a4ce39587ae595c38fbe5fe225c0c311fb Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Thu, 1 Dec 2016 10:52:53 +0100 Subject: [PATCH] Masked store experiment (which turned out to be much slower) --- .../fixedfunction/drawtrianglecodegen.cpp | 125 ++++++++++++++++++ tools/drawergen/ssa/ssa_float_ptr.cpp | 2 +- tools/drawergen/ssa/ssa_int_ptr.cpp | 22 ++- tools/drawergen/ssa/ssa_int_ptr.h | 3 + tools/drawergen/ssa/ssa_ubyte_ptr.cpp | 52 +++++++- tools/drawergen/ssa/ssa_ubyte_ptr.h | 4 + tools/drawergen/ssa/ssa_vec4f_ptr.cpp | 2 +- 7 files changed, 206 insertions(+), 4 deletions(-) diff --git a/tools/drawergen/fixedfunction/drawtrianglecodegen.cpp b/tools/drawergen/fixedfunction/drawtrianglecodegen.cpp index 69aacbbef4..e29e2a1223 100644 --- a/tools/drawergen/fixedfunction/drawtrianglecodegen.cpp +++ b/tools/drawergen/fixedfunction/drawtrianglecodegen.cpp @@ -541,6 +541,131 @@ void DrawTriangleCodegen::LoopPartialBlock() loopy.end_block(); } +#if 0 +void DrawTriangleCodegen::LoopMaskedStoreBlock() +{ + if (variant == TriDrawVariant::Stencil) + { + } + else if (variant == TriDrawVariant::StencilClose) + { + } + else + { + int pixelsize = truecolor ? 
4 : 1; + + AffineW = posx_w; + for (int i = 0; i < TriVertex::NumVarying; i++) + AffineVaryingPosY[i] = posx_varying[i]; + + SSAInt CY1 = C1 + DX12 * y0 - DY12 * x0; + SSAInt CY2 = C2 + DX23 * y0 - DY23 * x0; + SSAInt CY3 = C3 + DX31 * y0 - DY31 * x0; + + for (int iy = 0; iy < q; iy++) + { + SSAUBytePtr buffer = dest[(x + iy * pitch) * pixelsize]; + SSAIntPtr subsectorbuffer = subsectorGBuffer[x + iy * pitch]; + + SetupAffineBlock(); + + SSAInt CX1 = CY1; + SSAInt CX2 = CY2; + SSAInt CX3 = CY3; + + for (int ix = 0; ix < q; ix += 4) + { + SSABool covered[4]; + for (int maskindex = 0; maskindex < 4; maskindex++) + { + covered[maskindex] = CX1 > SSAInt(0) && CX2 > SSAInt(0) && CX3 > SSAInt(0); + + if (variant == TriDrawVariant::DrawSubsector || variant == TriDrawVariant::FillSubsector || variant == TriDrawVariant::FuzzSubsector) + { + auto xx = SSAInt(ix + maskindex); + auto yy = SSAInt(iy); + covered[maskindex] = covered[maskindex] && SSABool::compare_uge(StencilGet(xx, yy), stencilTestValue) && subsectorbuffer[ix + maskindex].load(true) >= subsectorDepth; + } + else if (variant == TriDrawVariant::StencilClose) + { + auto xx = SSAInt(ix + maskindex); + auto yy = SSAInt(iy); + covered[maskindex] = covered[maskindex] && SSABool::compare_uge(StencilGet(xx, yy), stencilTestValue); + } + else + { + auto xx = SSAInt(ix + maskindex); + auto yy = SSAInt(iy); + covered[maskindex] = covered[maskindex] && StencilGet(xx, yy) == stencilTestValue; + } + + CX1 = CX1 - FDY12; + CX2 = CX2 - FDY23; + CX3 = CX3 - FDY31; + } + + SSAUBytePtr buf = buffer[ix * pixelsize]; + if (truecolor) + { + SSAVec16ub pixels16 = buf.load_unaligned_vec16ub(false); + SSAVec8s pixels8hi = SSAVec8s::extendhi(pixels16); + SSAVec8s pixels8lo = SSAVec8s::extendlo(pixels16); + SSAVec4i pixels[4] = + { + SSAVec4i::extendlo(pixels8lo), + SSAVec4i::extendhi(pixels8lo), + SSAVec4i::extendlo(pixels8hi), + SSAVec4i::extendhi(pixels8hi) + }; + + for (int sse = 0; sse < 4; sse++) + { + pixels[sse] = 
ProcessPixel32(pixels[sse], AffineVaryingPosX); + + for (int i = 0; i < TriVertex::NumVarying; i++) + AffineVaryingPosX[i] = AffineVaryingPosX[i] + AffineVaryingStepX[i]; + } + + buf.store_masked_vec16ub(SSAVec16ub(SSAVec8s(pixels[0], pixels[1]), SSAVec8s(pixels[2], pixels[3])), covered); + } + else + { + SSAVec4i pixelsvec = buf.load_vec4ub(false); + SSAInt pixels[4] = + { + pixelsvec[0], + pixelsvec[1], + pixelsvec[2], + pixelsvec[3] + }; + + for (int sse = 0; sse < 4; sse++) + { + pixels[sse] = ProcessPixel8(pixels[sse], AffineVaryingPosX); + + for (int i = 0; i < TriVertex::NumVarying; i++) + AffineVaryingPosX[i] = AffineVaryingPosX[i] + AffineVaryingStepX[i]; + } + + buf.store_masked_vec4ub(SSAVec4i(pixels[0], pixels[1], pixels[2], pixels[3]), covered); + } + + if (variant != TriDrawVariant::DrawSubsector && variant != TriDrawVariant::FillSubsector && variant != TriDrawVariant::FuzzSubsector) + subsectorbuffer[ix].store_masked_vec4i(SSAVec4i(subsectorDepth), covered); + } + + AffineW = AffineW + gradWY; + for (int i = 0; i < TriVertex::NumVarying; i++) + AffineVaryingPosY[i] = AffineVaryingPosY[i] + gradVaryingY[i]; + + CY1 = CY1 + FDX12; + CY2 = CY2 + FDX23; + CY3 = CY3 + FDX31; + } + } +} +#endif + SSAVec4i DrawTriangleCodegen::TranslateSample32(SSAInt uvoffset) { if (variant == TriDrawVariant::FillNormal || variant == TriDrawVariant::FillSubsector || variant == TriDrawVariant::FuzzSubsector) diff --git a/tools/drawergen/ssa/ssa_float_ptr.cpp b/tools/drawergen/ssa/ssa_float_ptr.cpp index 50507884b0..731fbbef8e 100644 --- a/tools/drawergen/ssa/ssa_float_ptr.cpp +++ b/tools/drawergen/ssa/ssa_float_ptr.cpp @@ -86,6 +86,6 @@ void SSAFloatPtr::store_vec4f(const SSAVec4f &new_value) void SSAFloatPtr::store_unaligned_vec4f(const SSAVec4f &new_value) { llvm::PointerType *m4xfloattypeptr = llvm::VectorType::get(llvm::Type::getFloatTy(SSAScope::context()), 4)->getPointerTo(); - auto inst = SSAScope::builder().CreateAlignedStore(new_value.v, 
SSAScope::builder().CreateBitCast(v, m4xfloattypeptr, SSAScope::hint()), 1); + auto inst = SSAScope::builder().CreateAlignedStore(new_value.v, SSAScope::builder().CreateBitCast(v, m4xfloattypeptr, SSAScope::hint()), 4); inst->setMetadata(llvm::LLVMContext::MD_noalias, SSAScope::constant_scope_list()); } diff --git a/tools/drawergen/ssa/ssa_int_ptr.cpp b/tools/drawergen/ssa/ssa_int_ptr.cpp index 9b8b44501f..5f60f73589 100644 --- a/tools/drawergen/ssa/ssa_int_ptr.cpp +++ b/tools/drawergen/ssa/ssa_int_ptr.cpp @@ -23,6 +23,7 @@ #include "precomp.h" #include "ssa_int_ptr.h" #include "ssa_scope.h" +#include "ssa_bool.h" SSAIntPtr::SSAIntPtr() : v(0) @@ -86,6 +87,25 @@ void SSAIntPtr::store_vec4i(const SSAVec4i &new_value) void SSAIntPtr::store_unaligned_vec4i(const SSAVec4i &new_value) { llvm::PointerType *m4xint32typeptr = llvm::VectorType::get(llvm::Type::getInt32Ty(SSAScope::context()), 4)->getPointerTo(); - auto inst = SSAScope::builder().CreateAlignedStore(new_value.v, SSAScope::builder().CreateBitCast(v, m4xint32typeptr, SSAScope::hint()), 1); + auto inst = SSAScope::builder().CreateAlignedStore(new_value.v, SSAScope::builder().CreateBitCast(v, m4xint32typeptr, SSAScope::hint()), 4); + inst->setMetadata(llvm::LLVMContext::MD_noalias, SSAScope::constant_scope_list()); +} + +void SSAIntPtr::store_masked_vec4i(const SSAVec4i &new_value, SSABool mask[4]) +{ + // Create mask vector + std::vector<llvm::Constant*> maskconstants; + maskconstants.resize(4, llvm::ConstantInt::get(SSAScope::context(), llvm::APInt(1, 0, false))); + llvm::Value *maskValue = llvm::ConstantVector::get(maskconstants); +#if LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9) + for (int i = 0; i < 4; i++) + maskValue = SSAScope::builder().CreateInsertElement(maskValue, mask[i].v, SSAInt(i).v, SSAScope::hint()); +#else + for (int i = 0; i < 4; i++) + maskValue = SSAScope::builder().CreateInsertElement(maskValue, mask[i].v, (uint64_t)i, SSAScope::hint()); +#endif + + llvm::PointerType 
*m4xint32typeptr = llvm::VectorType::get(llvm::Type::getInt32Ty(SSAScope::context()), 4)->getPointerTo(); + auto inst = SSAScope::builder().CreateMaskedStore(new_value.v, SSAScope::builder().CreateBitCast(v, m4xint32typeptr, SSAScope::hint()), 1, maskValue); inst->setMetadata(llvm::LLVMContext::MD_noalias, SSAScope::constant_scope_list()); } diff --git a/tools/drawergen/ssa/ssa_int_ptr.h b/tools/drawergen/ssa/ssa_int_ptr.h index 04075c15eb..da6ecf168b 100644 --- a/tools/drawergen/ssa/ssa_int_ptr.h +++ b/tools/drawergen/ssa/ssa_int_ptr.h @@ -29,6 +29,8 @@ namespace llvm { class Value; } namespace llvm { class Type; } +class SSABool; + class SSAIntPtr { public: @@ -44,6 +46,7 @@ public: void store(const SSAInt &new_value); void store_vec4i(const SSAVec4i &new_value); void store_unaligned_vec4i(const SSAVec4i &new_value); + void store_masked_vec4i(const SSAVec4i &new_value, SSABool mask[4]); llvm::Value *v; }; diff --git a/tools/drawergen/ssa/ssa_ubyte_ptr.cpp b/tools/drawergen/ssa/ssa_ubyte_ptr.cpp index c6b835a1cd..bde0b5b643 100644 --- a/tools/drawergen/ssa/ssa_ubyte_ptr.cpp +++ b/tools/drawergen/ssa/ssa_ubyte_ptr.cpp @@ -23,6 +23,7 @@ #include "precomp.h" #include "ssa_ubyte_ptr.h" #include "ssa_scope.h" +#include "ssa_bool.h" SSAUBytePtr::SSAUBytePtr() : v(0) @@ -104,6 +105,37 @@ void SSAUBytePtr::store_vec4ub(const SSAVec4i &new_value) inst->setMetadata(llvm::LLVMContext::MD_noalias, SSAScope::constant_scope_list()); } +void SSAUBytePtr::store_masked_vec4ub(const SSAVec4i &new_value, SSABool mask[4]) +{ + // Store using saturate: + SSAVec8s v8s(new_value, new_value); + SSAVec16ub v16ub(v8s, v8s); + + // Create mask vector + std::vector<llvm::Constant*> maskconstants; + maskconstants.resize(4, llvm::ConstantInt::get(SSAScope::context(), llvm::APInt(1, 0, false))); + llvm::Value *maskValue = llvm::ConstantVector::get(maskconstants); +#if LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9) + for (int i = 0; i < 4; i++) + maskValue = 
SSAScope::builder().CreateInsertElement(maskValue, mask[i].v, SSAInt(i).v, SSAScope::hint()); +#else + for (int i = 0; i < 4; i++) + maskValue = SSAScope::builder().CreateInsertElement(maskValue, mask[i].v, (uint64_t)i, SSAScope::hint()); +#endif + + llvm::Type *m16xint8type = llvm::VectorType::get(llvm::Type::getInt8Ty(SSAScope::context()), 16); + llvm::PointerType *m4xint8typeptr = llvm::VectorType::get(llvm::Type::getInt8Ty(SSAScope::context()), 4)->getPointerTo(); + std::vector<llvm::Constant*> constants; + constants.push_back(llvm::ConstantInt::get(SSAScope::context(), llvm::APInt(32, 0))); + constants.push_back(llvm::ConstantInt::get(SSAScope::context(), llvm::APInt(32, 1))); + constants.push_back(llvm::ConstantInt::get(SSAScope::context(), llvm::APInt(32, 2))); + constants.push_back(llvm::ConstantInt::get(SSAScope::context(), llvm::APInt(32, 3))); + llvm::Value *shufflemask = llvm::ConstantVector::get(constants); + llvm::Value *val_vector = SSAScope::builder().CreateShuffleVector(v16ub.v, llvm::UndefValue::get(m16xint8type), shufflemask, SSAScope::hint()); + llvm::CallInst *inst = SSAScope::builder().CreateMaskedStore(val_vector, SSAScope::builder().CreateBitCast(v, m4xint8typeptr, SSAScope::hint()), 1, maskValue); + inst->setMetadata(llvm::LLVMContext::MD_noalias, SSAScope::constant_scope_list()); +} + void SSAUBytePtr::store_vec16ub(const SSAVec16ub &new_value) { llvm::PointerType *m16xint8typeptr = llvm::VectorType::get(llvm::Type::getInt8Ty(SSAScope::context()), 16)->getPointerTo(); @@ -118,6 +150,24 @@ void SSAUBytePtr::store_vec16ub(const SSAVec16ub &new_value) void SSAUBytePtr::store_unaligned_vec16ub(const SSAVec16ub &new_value) { llvm::PointerType *m16xint8typeptr = llvm::VectorType::get(llvm::Type::getInt8Ty(SSAScope::context()), 16)->getPointerTo(); - llvm::StoreInst *inst = SSAScope::builder().CreateAlignedStore(new_value.v, SSAScope::builder().CreateBitCast(v, m16xint8typeptr, SSAScope::hint()), 1); + llvm::StoreInst *inst = 
SSAScope::builder().CreateAlignedStore(new_value.v, SSAScope::builder().CreateBitCast(v, m16xint8typeptr, SSAScope::hint()), 4); + inst->setMetadata(llvm::LLVMContext::MD_noalias, SSAScope::constant_scope_list()); +} + +void SSAUBytePtr::store_masked_vec16ub(const SSAVec16ub &new_value, SSABool mask[4]) +{ + std::vector<llvm::Constant*> constants; + constants.resize(16, llvm::ConstantInt::get(SSAScope::context(), llvm::APInt(1, 0, false))); + llvm::Value *maskValue = llvm::ConstantVector::get(constants); +#if LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9) + for (int i = 0; i < 16; i++) + maskValue = SSAScope::builder().CreateInsertElement(maskValue, mask[i / 4].v, SSAInt(i).v, SSAScope::hint()); +#else + for (int i = 0; i < 16; i++) + maskValue = SSAScope::builder().CreateInsertElement(maskValue, mask[i / 4].v, (uint64_t)i, SSAScope::hint()); +#endif + + llvm::PointerType *m16xint8typeptr = llvm::VectorType::get(llvm::Type::getInt8Ty(SSAScope::context()), 16)->getPointerTo(); + llvm::CallInst *inst = SSAScope::builder().CreateMaskedStore(new_value.v, SSAScope::builder().CreateBitCast(v, m16xint8typeptr, SSAScope::hint()), 1, maskValue); inst->setMetadata(llvm::LLVMContext::MD_noalias, SSAScope::constant_scope_list()); } diff --git a/tools/drawergen/ssa/ssa_ubyte_ptr.h b/tools/drawergen/ssa/ssa_ubyte_ptr.h index b4567597e2..ba4fb5397e 100644 --- a/tools/drawergen/ssa/ssa_ubyte_ptr.h +++ b/tools/drawergen/ssa/ssa_ubyte_ptr.h @@ -31,6 +31,8 @@ namespace llvm { class Value; } namespace llvm { class Type; } +class SSABool; + class SSAUBytePtr { public: @@ -46,8 +48,10 @@ public: SSAVec16ub load_unaligned_vec16ub(bool constantScopeDomain) const; void store(const SSAUByte &new_value); void store_vec4ub(const SSAVec4i &new_value); + void store_masked_vec4ub(const SSAVec4i &new_value, SSABool mask[4]); void store_vec16ub(const SSAVec16ub &new_value); void store_unaligned_vec16ub(const SSAVec16ub &new_value); + void store_masked_vec16ub(const SSAVec16ub 
&new_value, SSABool mask[4]); llvm::Value *v; }; diff --git a/tools/drawergen/ssa/ssa_vec4f_ptr.cpp b/tools/drawergen/ssa/ssa_vec4f_ptr.cpp index 8cdee930db..866331f840 100644 --- a/tools/drawergen/ssa/ssa_vec4f_ptr.cpp +++ b/tools/drawergen/ssa/ssa_vec4f_ptr.cpp @@ -68,6 +68,6 @@ void SSAVec4fPtr::store(const SSAVec4f &new_value) void SSAVec4fPtr::store_unaligned(const SSAVec4f &new_value) { - auto inst = SSAScope::builder().CreateAlignedStore(new_value.v, v, 1, false); + auto inst = SSAScope::builder().CreateAlignedStore(new_value.v, v, 4, false); inst->setMetadata(llvm::LLVMContext::MD_noalias, SSAScope::constant_scope_list()); }