diff --git a/src/r_compiler/fixedfunction/drawtrianglecodegen.cpp b/src/r_compiler/fixedfunction/drawtrianglecodegen.cpp index 1ad98a8fb3..fbbca7517a 100644 --- a/src/r_compiler/fixedfunction/drawtrianglecodegen.cpp +++ b/src/r_compiler/fixedfunction/drawtrianglecodegen.cpp @@ -303,57 +303,80 @@ void DrawTriangleCodegen::LoopFullBlock() { int pixelsize = truecolor ? 4 : 1; - stack_iy.store(SSAInt(0)); - stack_buffer.store(dest[x * pixelsize]); - stack_subsectorbuffer.store(subsectorGBuffer[x]); - - SSAForBlock loopy; - SSAInt iy = stack_iy.load(); - SSAUBytePtr buffer = stack_buffer.load(); - SSAIntPtr subsectorbuffer = stack_subsectorbuffer.load(); - loopy.loop_block(iy < SSAInt(q), q); + for (int iy = 0; iy < q; iy++) { + SSAUBytePtr buffer = dest[(x + iy * pitch) * pixelsize]; + SSAIntPtr subsectorbuffer = subsectorGBuffer[x + iy * pitch]; + + SSAInt varying[TriVertex::NumVarying]; SSAInt varyingStep[TriVertex::NumVarying]; for (int i = 0; i < TriVertex::NumVarying; i++) { - stack_varying[i].store((varyingPos[i] + varyingStepPos[i] * iy) << 8); + varying[i] = (varyingPos[i] + varyingStepPos[i] * iy) << 8; varyingStep[i] = (varyingStartStepX[i] + varyingIncrStepX[i] * iy) << 8; } - stack_ix.store(SSAInt(0)); - SSAForBlock loopx; - SSAInt ix = stack_ix.load(); - SSAInt varying[TriVertex::NumVarying]; - for (int i = 0; i < TriVertex::NumVarying; i++) - varying[i] = stack_varying[i].load(); - loopx.loop_block(ix < SSAInt(q), q); + for (int ix = 0; ix < q; ix += 4) { - if (variant == TriDrawVariant::DrawSubsector || variant == TriDrawVariant::FillSubsector || variant == TriDrawVariant::FuzzSubsector) + SSAUBytePtr buf = buffer[ix * pixelsize]; + if (truecolor) { - SSAIfBlock branch; - branch.if_block(subsectorbuffer[ix].load(true) >= subsectorDepth); + SSAVec16ub pixels16 = buf.load_unaligned_vec16ub(false); + SSAVec8s pixels8hi = SSAVec8s::extendhi(pixels16); + SSAVec8s pixels8lo = SSAVec8s::extendlo(pixels16); + SSAVec4i pixels[4] = { - ProcessPixel(buffer[ix * 
pixelsize], subsectorbuffer[ix], varying); + SSAVec4i::extendlo(pixels8lo), + SSAVec4i::extendhi(pixels8lo), + SSAVec4i::extendlo(pixels8hi), + SSAVec4i::extendhi(pixels8hi) + }; + + for (int sse = 0; sse < 4; sse++) + { + if (variant == TriDrawVariant::DrawSubsector || variant == TriDrawVariant::FillSubsector || variant == TriDrawVariant::FuzzSubsector) + { + SSABool subsectorTest = subsectorbuffer[ix + sse].load(true) >= subsectorDepth; /* test pixel sse against its own subsector entry, not the first pixel of the group of 4 */ + pixels[sse] = subsectorTest.select(ProcessPixel32(pixels[sse], varying), pixels[sse]); + } + else + { + pixels[sse] = ProcessPixel32(pixels[sse], varying); + } + + for (int i = 0; i < TriVertex::NumVarying; i++) + varying[i] = varying[i] + varyingStep[i]; } - branch.end_block(); + + buf.store_unaligned_vec16ub(SSAVec16ub(SSAVec8s(pixels[0], pixels[1]), SSAVec8s(pixels[2], pixels[3]))); } else { - ProcessPixel(buffer[ix * pixelsize], subsectorbuffer[ix], varying); + SSAVec4i pixels = buf.load_vec4ub(false); + + for (int sse = 0; sse < 4; sse++) + { + if (variant == TriDrawVariant::DrawSubsector || variant == TriDrawVariant::FillSubsector || variant == TriDrawVariant::FuzzSubsector) + { + SSABool subsectorTest = subsectorbuffer[ix + sse].load(true) >= subsectorDepth; /* test pixel sse against its own subsector entry, not the first pixel of the group of 4 */ + pixels.insert(sse, subsectorTest.select(ProcessPixel8(pixels[sse], varying), pixels[sse])); + } + else + { + pixels.insert(sse, ProcessPixel8(pixels[sse], varying)); + } + + for (int i = 0; i < TriVertex::NumVarying; i++) + varying[i] = varying[i] + varyingStep[i]; + } + + buf.store_vec4ub(pixels); } - for (int i = 0; i < TriVertex::NumVarying; i++) - stack_varying[i].store(varying[i] + varyingStep[i]); - - stack_ix.store(ix + 1); + if (variant != TriDrawVariant::DrawSubsector && variant != TriDrawVariant::FillSubsector && variant != TriDrawVariant::FuzzSubsector) + subsectorbuffer[ix].store_unaligned_vec4i(SSAVec4i(subsectorDepth)); } - loopx.end_block(); - - stack_buffer.store(buffer[pitch * pixelsize]); - stack_subsectorbuffer.store(subsectorbuffer[pitch]); - 
stack_iy.store(iy + 1); } - loopy.end_block(); } if (variant != TriDrawVariant::DrawSubsector && variant != TriDrawVariant::FillSubsector && variant != TriDrawVariant::FuzzSubsector) @@ -425,7 +448,21 @@ void DrawTriangleCodegen::LoopPartialBlock() } else { - ProcessPixel(buffer[ix * pixelsize], subsectorbuffer[ix], varying); + SSAUBytePtr buf = buffer[ix * pixelsize]; + + if (truecolor) + { + SSAVec4i bg = buf.load_vec4ub(false); + buf.store_vec4ub(ProcessPixel32(bg, varying)); + } + else + { + SSAUByte bg = buf.load(false); + buf.store(ProcessPixel8(bg.zext_int(), varying).trunc_ubyte()); + } + + if (variant != TriDrawVariant::DrawSubsector && variant != TriDrawVariant::FillSubsector && variant != TriDrawVariant::FuzzSubsector) + subsectorbuffer[ix].store(subsectorDepth); } } branch.end_block(); @@ -466,7 +503,7 @@ SSAVec4i DrawTriangleCodegen::Sample(SSAInt uvoffset) return texturePixels[uvoffset * 4].load_vec4ub(true); } -void DrawTriangleCodegen::ProcessPixel(SSAUBytePtr buffer, SSAIntPtr subsectorbuffer, SSAInt *varying) +SSAVec4i DrawTriangleCodegen::ProcessPixel32(SSAVec4i bg, SSAInt *varying) { SSAInt ufrac = varying[0]; SSAInt vfrac = varying[1]; @@ -475,80 +512,81 @@ void DrawTriangleCodegen::ProcessPixel(SSAUBytePtr buffer, SSAIntPtr subsectorbu SSAInt vpos = ((vfrac >> 16) * textureHeight) >> 16; SSAInt uvoffset = upos * textureHeight + vpos; - if (truecolor) + SSAVec4i fg; + SSAInt alpha, inv_alpha; + SSAVec4i output; + + switch (blendmode) { - SSAVec4i fg; - SSAVec4i bg = buffer.load_vec4ub(false); - SSAInt alpha, inv_alpha; - SSAVec4i output; + default: + case TriBlendMode::Copy: + fg = Sample(uvoffset); + output = blend_copy(shade_bgra_simple(fg, currentlight)); break; + case TriBlendMode::AlphaBlend: + fg = Sample(uvoffset); + output = blend_alpha_blend(shade_bgra_simple(fg, currentlight), bg); break; + case TriBlendMode::AddSolid: + fg = Sample(uvoffset); + output = blend_add(shade_bgra_simple(fg, currentlight), bg, srcalpha, destalpha); break; + 
case TriBlendMode::Add: + fg = Sample(uvoffset); + output = blend_add(shade_bgra_simple(fg, currentlight), bg, srcalpha, calc_blend_bgalpha(fg, destalpha)); break; + case TriBlendMode::Sub: + fg = Sample(uvoffset); + output = blend_sub(shade_bgra_simple(fg, currentlight), bg, srcalpha, calc_blend_bgalpha(fg, destalpha)); break; + case TriBlendMode::RevSub: + fg = Sample(uvoffset); + output = blend_revsub(shade_bgra_simple(fg, currentlight), bg, srcalpha, calc_blend_bgalpha(fg, destalpha)); break; + case TriBlendMode::Shaded: + fg = Sample(uvoffset); + alpha = fg[0]; + alpha = alpha + (alpha >> 7); // 255 -> 256 + inv_alpha = 256 - alpha; + output = blend_add(shade_bgra_simple(SSAVec4i::unpack(color), currentlight), bg, alpha, inv_alpha); + break; + case TriBlendMode::TranslateCopy: + fg = TranslateSample(uvoffset); + output = blend_copy(shade_bgra_simple(fg, currentlight)); + break; + case TriBlendMode::TranslateAlphaBlend: + fg = TranslateSample(uvoffset); + output = blend_alpha_blend(shade_bgra_simple(fg, currentlight), bg); break; + break; + case TriBlendMode::TranslateAdd: + fg = TranslateSample(uvoffset); + output = blend_add(shade_bgra_simple(fg, currentlight), bg, srcalpha, calc_blend_bgalpha(fg, destalpha)); + break; + case TriBlendMode::TranslateSub: + fg = TranslateSample(uvoffset); + output = blend_sub(shade_bgra_simple(fg, currentlight), bg, srcalpha, calc_blend_bgalpha(fg, destalpha)); + break; + case TriBlendMode::TranslateRevSub: + fg = TranslateSample(uvoffset); + output = blend_revsub(shade_bgra_simple(fg, currentlight), bg, srcalpha, calc_blend_bgalpha(fg, destalpha)); + break; + } - switch (blendmode) - { - default: - case TriBlendMode::Copy: - fg = Sample(uvoffset); - output = blend_copy(shade_bgra_simple(fg, currentlight)); break; - case TriBlendMode::AlphaBlend: - fg = Sample(uvoffset); - output = blend_alpha_blend(shade_bgra_simple(fg, currentlight), bg); break; - case TriBlendMode::AddSolid: - fg = Sample(uvoffset); - output = 
blend_add(shade_bgra_simple(fg, currentlight), bg, srcalpha, destalpha); break; - case TriBlendMode::Add: - fg = Sample(uvoffset); - output = blend_add(shade_bgra_simple(fg, currentlight), bg, srcalpha, calc_blend_bgalpha(fg, destalpha)); break; - case TriBlendMode::Sub: - fg = Sample(uvoffset); - output = blend_sub(shade_bgra_simple(fg, currentlight), bg, srcalpha, calc_blend_bgalpha(fg, destalpha)); break; - case TriBlendMode::RevSub: - fg = Sample(uvoffset); - output = blend_revsub(shade_bgra_simple(fg, currentlight), bg, srcalpha, calc_blend_bgalpha(fg, destalpha)); break; - case TriBlendMode::Shaded: - fg = Sample(uvoffset); - alpha = fg[0]; - alpha = alpha + (alpha >> 7); // 255 -> 256 - inv_alpha = 256 - alpha; - output = blend_add(shade_bgra_simple(SSAVec4i::unpack(color), currentlight), bg, alpha, inv_alpha); - break; - case TriBlendMode::TranslateCopy: - fg = TranslateSample(uvoffset); - output = blend_copy(shade_bgra_simple(fg, currentlight)); - break; - case TriBlendMode::TranslateAlphaBlend: - fg = TranslateSample(uvoffset); - output = blend_alpha_blend(shade_bgra_simple(fg, currentlight), bg); break; - break; - case TriBlendMode::TranslateAdd: - fg = TranslateSample(uvoffset); - output = blend_add(shade_bgra_simple(fg, currentlight), bg, srcalpha, calc_blend_bgalpha(fg, destalpha)); - break; - case TriBlendMode::TranslateSub: - fg = TranslateSample(uvoffset); - output = blend_sub(shade_bgra_simple(fg, currentlight), bg, srcalpha, calc_blend_bgalpha(fg, destalpha)); - break; - case TriBlendMode::TranslateRevSub: - fg = TranslateSample(uvoffset); - output = blend_revsub(shade_bgra_simple(fg, currentlight), bg, srcalpha, calc_blend_bgalpha(fg, destalpha)); - break; - } + return output; +} - buffer.store_vec4ub(output); +SSAInt DrawTriangleCodegen::ProcessPixel8(SSAInt bg, SSAInt *varying) +{ + SSAInt ufrac = varying[0]; + SSAInt vfrac = varying[1]; + + SSAInt upos = ((ufrac >> 16) * textureWidth) >> 16; + SSAInt vpos = ((vfrac >> 16) * textureHeight) >> 
16; + SSAInt uvoffset = upos * textureHeight + vpos; + + if (variant == TriDrawVariant::FillNormal || variant == TriDrawVariant::FillSubsector || variant == TriDrawVariant::FuzzSubsector) + { + return color; } else { - if (variant == TriDrawVariant::FillNormal || variant == TriDrawVariant::FillSubsector || variant == TriDrawVariant::FuzzSubsector) - { - buffer.store(color.trunc_ubyte()); - } - else - { - SSAUByte fg = texturePixels[uvoffset].load(true); - buffer.store(fg); - } + SSAUByte fg = texturePixels[uvoffset].load(true); + return fg.zext_int(); } - - if (variant != TriDrawVariant::DrawSubsector && variant != TriDrawVariant::FillSubsector && variant != TriDrawVariant::FuzzSubsector) - subsectorbuffer.store(subsectorDepth); } void DrawTriangleCodegen::SetStencilBlock(SSAInt block) diff --git a/src/r_compiler/fixedfunction/drawtrianglecodegen.h b/src/r_compiler/fixedfunction/drawtrianglecodegen.h index d5539ef22f..e4c22d49cd 100644 --- a/src/r_compiler/fixedfunction/drawtrianglecodegen.h +++ b/src/r_compiler/fixedfunction/drawtrianglecodegen.h @@ -46,7 +46,8 @@ private: void LoopFullBlock(); void LoopPartialBlock(); - void ProcessPixel(SSAUBytePtr buffer, SSAIntPtr subsectorbuffer, SSAInt *varying); + SSAVec4i ProcessPixel32(SSAVec4i bg, SSAInt *varying); + SSAInt ProcessPixel8(SSAInt bg, SSAInt *varying); SSAVec4i TranslateSample(SSAInt uvoffset); SSAVec4i Sample(SSAInt uvoffset); diff --git a/src/r_compiler/llvmdrawers.cpp b/src/r_compiler/llvmdrawers.cpp index a3432eae65..913e5d9b78 100644 --- a/src/r_compiler/llvmdrawers.cpp +++ b/src/r_compiler/llvmdrawers.cpp @@ -127,7 +127,7 @@ LLVMDrawers *LLVMDrawers::Instance() LLVMDrawersImpl::LLVMDrawersImpl() { - int version = 1; // Increment this number if the drawer codegen is modified (forces recreation of the module). + int version = 2; // Increment this number if the drawer codegen is modified (forces recreation of the module). 
std::string targetCPU = mProgram.GetTargetCPU(); bool loaded = mProgram.LoadCachedModule(version, targetCPU); if (!loaded) diff --git a/src/r_compiler/ssa/ssa_bool.cpp b/src/r_compiler/ssa/ssa_bool.cpp index 65cc25c90c..916350c59d 100644 --- a/src/r_compiler/ssa/ssa_bool.cpp +++ b/src/r_compiler/ssa/ssa_bool.cpp @@ -23,6 +23,7 @@ #include "r_compiler/llvm_include.h" #include "ssa_bool.h" #include "ssa_ubyte.h" +#include "ssa_vec4i.h" #include "ssa_value.h" #include "ssa_scope.h" @@ -61,6 +62,11 @@ SSAUByte SSABool::select(SSAUByte a, SSAUByte b) return SSAValue::from_llvm(SSAScope::builder().CreateSelect(v, a.v, b.v, SSAScope::hint())); } +SSAVec4i SSABool::select(SSAVec4i a, SSAVec4i b) +{ + return SSAValue::from_llvm(SSAScope::builder().CreateSelect(v, a.v, b.v, SSAScope::hint())); +} + SSABool operator&&(const SSABool &a, const SSABool &b) { return SSABool::from_llvm(SSAScope::builder().CreateAnd(a.v, b.v, SSAScope::hint())); diff --git a/src/r_compiler/ssa/ssa_bool.h b/src/r_compiler/ssa/ssa_bool.h index 2ed6e7d4a6..372c626c04 100644 --- a/src/r_compiler/ssa/ssa_bool.h +++ b/src/r_compiler/ssa/ssa_bool.h @@ -29,6 +29,8 @@ namespace llvm { class Value; } namespace llvm { class Type; } +class SSAVec4i; + class SSABool { public: @@ -41,6 +43,7 @@ public: SSAInt zext_int(); SSAInt select(SSAInt a, SSAInt b); SSAUByte select(SSAUByte a, SSAUByte b); + SSAVec4i select(SSAVec4i a, SSAVec4i b); llvm::Value *v; };