diff --git a/CMakeLists.txt b/CMakeLists.txt index fc22ca194..277fec59c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ if( COMMAND cmake_policy ) endif() endif() -list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR} ) +list( APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake ) include( CreateLaunchers ) include( FindPackageHandleStandardArgs ) @@ -148,8 +148,6 @@ if( ZD_CMAKE_COMPILER_IS_GNUCXX_COMPATIBLE ) set( PROFILE 0 CACHE BOOL "Enable profiling with gprof for Debug and RelWithDebInfo build types." ) endif() -set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}") - option( NO_OPENAL "Disable OpenAL sound support" OFF ) find_package( BZip2 ) diff --git a/CleanDirectoryList.cmake b/cmake/CleanDirectoryList.cmake similarity index 100% rename from CleanDirectoryList.cmake rename to cmake/CleanDirectoryList.cmake diff --git a/CreateLaunchers.cmake b/cmake/CreateLaunchers.cmake similarity index 100% rename from CreateLaunchers.cmake rename to cmake/CreateLaunchers.cmake diff --git a/FindFluidSynth.cmake b/cmake/FindFluidSynth.cmake similarity index 100% rename from FindFluidSynth.cmake rename to cmake/FindFluidSynth.cmake diff --git a/FindMPG123.cmake b/cmake/FindMPG123.cmake similarity index 100% rename from FindMPG123.cmake rename to cmake/FindMPG123.cmake diff --git a/FindSDL2.cmake b/cmake/FindSDL2.cmake similarity index 100% rename from FindSDL2.cmake rename to cmake/FindSDL2.cmake diff --git a/FindSndFile.cmake b/cmake/FindSndFile.cmake similarity index 100% rename from FindSndFile.cmake rename to cmake/FindSndFile.cmake diff --git a/cmake/TargetArch.cmake b/cmake/TargetArch.cmake new file mode 100644 index 000000000..198aa9c05 --- /dev/null +++ b/cmake/TargetArch.cmake @@ -0,0 +1,157 @@ + +# Copyright (c) 2012 Petroules Corporation. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation and/or +# other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +# IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +# OF SUCH DAMAGE. + +# Based on the Qt 5 processor detection code, so should be very accurate +# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h +# Currently handles arm (v5, v6, v7), x86 (32/64), ia64, and ppc (32/64) + +# Regarding POWER/PowerPC, just as is noted in the Qt source, +# "There are many more known variants/revisions that we do not handle/detect." + +set(archdetect_c_code " +#if defined(__arm__) || defined(__TARGET_ARCH_ARM) + #if defined(__ARM_ARCH_7__) \\ + || defined(__ARM_ARCH_7A__) \\ + || defined(__ARM_ARCH_7R__) \\ + || defined(__ARM_ARCH_7M__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7) + #error cmake_ARCH armv7 + #elif defined(__ARM_ARCH_6__) \\ + || defined(__ARM_ARCH_6J__) \\ + || defined(__ARM_ARCH_6T2__) \\ + || defined(__ARM_ARCH_6Z__) \\ + || defined(__ARM_ARCH_6K__) \\ + || defined(__ARM_ARCH_6ZK__) \\ + || defined(__ARM_ARCH_6M__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6) + #error cmake_ARCH armv6 + #elif defined(__ARM_ARCH_5TEJ__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5) + #error cmake_ARCH armv5 + #else + #error cmake_ARCH arm + #endif +#elif defined(__i386) || defined(__i386__) || defined(_M_IX86) + #error cmake_ARCH i386 +#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64) + #error cmake_ARCH x86_64 +#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64) + #error cmake_ARCH ia64 +#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\ + || defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \\ + || defined(_M_MPPC) || defined(_M_PPC) + #if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__) + #error cmake_ARCH ppc64 + #else + #error cmake_ARCH ppc + #endif +#endif + +#error cmake_ARCH unknown +") + +# Set ppc_support to TRUE before including this file or ppc and ppc64 +# will be treated as invalid architectures since they are no longer supported by Apple + +function(target_architecture output_var) + if(APPLE AND CMAKE_OSX_ARCHITECTURES) + # On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set + # First let's normalize the order of the values + + # Note that it's not possible to compile PowerPC applications if you are using + # the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we + # disable it by default + # See this page for more information: + # http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4 + + # Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime. + # On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise. + + foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES}) + if("${osx_arch}" STREQUAL "ppc" AND ppc_support) + set(osx_arch_ppc TRUE) + elseif("${osx_arch}" STREQUAL "i386") + set(osx_arch_i386 TRUE) + elseif("${osx_arch}" STREQUAL "x86_64") + set(osx_arch_x86_64 TRUE) + elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support) + set(osx_arch_ppc64 TRUE) + else() + message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}") + endif() + endforeach() + + # Now add all the architectures in our normalized order + if(osx_arch_ppc) + list(APPEND ARCH ppc) + endif() + + if(osx_arch_i386) + list(APPEND ARCH i386) + endif() + + if(osx_arch_x86_64) + list(APPEND ARCH x86_64) + endif() + + if(osx_arch_ppc64) + list(APPEND ARCH ppc64) + endif() + else() + file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}") + + enable_language(C) + + # Detect the architecture in a rather creative way... + # This compiles a small C program which is a series of ifdefs that selects a + # particular #error preprocessor directive whose message string contains the + # target architecture. The program will always fail to compile (both because + # file is not a valid C program, and obviously because of the presence of the + # #error preprocessor directives... but by exploiting the preprocessor in this + # way, we can detect the correct target architecture even when cross-compiling, + # since the program itself never needs to be run (only the compiler/preprocessor) + try_run( + run_result_unused + compile_result_unused + "${CMAKE_BINARY_DIR}" + "${CMAKE_BINARY_DIR}/arch.c" + COMPILE_OUTPUT_VARIABLE ARCH + CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} + ) + + # Parse the architecture name from the compiler output + string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}") + + # Get rid of the value marker leaving just the architecture name + string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}") + + # If we are compiling with an unknown architecture this variable should + # already be set to "unknown" but in the case that it's empty (i.e. due + # to a typo in the code), then set it to unknown + if (NOT ARCH) + set(ARCH unknown) + endif() + endif() + + set(${output_var} "${ARCH}" PARENT_SCOPE) +endfunction() diff --git a/launcher-templates/genericlauncher.cmd.in b/cmake/launcher-templates/genericlauncher.cmd.in similarity index 100% rename from launcher-templates/genericlauncher.cmd.in rename to cmake/launcher-templates/genericlauncher.cmd.in diff --git a/launcher-templates/launcher.env.cmd.in b/cmake/launcher-templates/launcher.env.cmd.in similarity index 100% rename from launcher-templates/launcher.env.cmd.in rename to cmake/launcher-templates/launcher.env.cmd.in diff --git a/launcher-templates/perconfig.vcproj.user.in b/cmake/launcher-templates/perconfig.vcproj.user.in similarity index 100% rename from launcher-templates/perconfig.vcproj.user.in rename to cmake/launcher-templates/perconfig.vcproj.user.in diff --git a/launcher-templates/perconfig.vcxproj.user.in b/cmake/launcher-templates/perconfig.vcxproj.user.in similarity index 100% rename from launcher-templates/perconfig.vcxproj.user.in rename to cmake/launcher-templates/perconfig.vcxproj.user.in diff --git a/launcher-templates/targetlauncher.cmd.in b/cmake/launcher-templates/targetlauncher.cmd.in similarity index 100% rename from launcher-templates/targetlauncher.cmd.in rename to cmake/launcher-templates/targetlauncher.cmd.in diff --git a/launcher-templates/vcproj.user.in b/cmake/launcher-templates/vcproj.user.in similarity index 100% rename from launcher-templates/vcproj.user.in rename to cmake/launcher-templates/vcproj.user.in diff --git a/launcher-templates/vcxproj.user.in b/cmake/launcher-templates/vcxproj.user.in similarity index 100% rename from launcher-templates/vcxproj.user.in rename to cmake/launcher-templates/vcxproj.user.in diff --git a/precompiled_headers.cmake b/cmake/precompiled_headers.cmake similarity index 100% rename from precompiled_headers.cmake rename to cmake/precompiled_headers.cmake diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9d7582e9f..0e5f75837 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required( VERSION 2.8.7 ) -include(../precompiled_headers.cmake) +include(precompiled_headers) if( COMMAND cmake_policy ) cmake_policy( SET CMP0003 NEW ) @@ -13,6 +13,7 @@ include( CheckIncludeFile ) include( CheckIncludeFiles ) include( CheckLibraryExists ) include( FindPkgConfig ) +include( TargetArch ) if( ZD_CMAKE_COMPILER_IS_GNUCXX_COMPATIBLE ) option( NO_STRIP "Do not strip Release or MinSizeRel builds" ) @@ -32,7 +33,9 @@ if( APPLE ) option( OSX_COCOA_BACKEND "Use native Cocoa backend instead of SDL" ON ) endif() -if( CMAKE_SIZEOF_VOID_P MATCHES "8" ) +target_architecture(ZDOOM_TARGET_ARCH) + +if( ${ZDOOM_TARGET_ARCH} MATCHES "x86_64" ) set( X64 64 ) endif() @@ -347,6 +350,11 @@ if( ZD_CMAKE_COMPILER_IS_GNUCXX_COMPATIBLE ) endif () endif () + if( NOT X64 AND NOT CAN_DO_MFPMATH ) + set( CMAKE_C_FLAGS "-DNO_SSE ${CMAKE_CXX_FLAGS}" ) + set( CMAKE_CXX_FLAGS "-DNO_SSE ${CMAKE_CXX_FLAGS}" ) + endif() + # Use the highest C++ standard available since VS2015 compiles with C++14 # but we only require C++11. The recommended way to do this in CMake is to # probably to use target_compile_features, but I don't feel like maintaining diff --git a/src/gl/utility/gl_clock.h b/src/gl/utility/gl_clock.h index 957090861..fb96a69a2 100644 --- a/src/gl/utility/gl_clock.h +++ b/src/gl/utility/gl_clock.h @@ -14,11 +14,7 @@ extern double gl_MillisecPerCycle; __forceinline int64_t GetClockCycle () { -#if _M_X64 return __rdtsc(); -#else - return CPU.bRDTSC ? __rdtsc() : 0; -#endif } #elif defined __APPLE__ && (defined __i386__ || defined __x86_64__) diff --git a/src/polyrenderer/drawers/poly_drawer32_avx2.h b/src/polyrenderer/drawers/poly_drawer32_avx2.h deleted file mode 100644 index 9091ae21a..000000000 --- a/src/polyrenderer/drawers/poly_drawer32_avx2.h +++ /dev/null @@ -1,739 +0,0 @@ -/* -** Projected triangle drawer -** Copyright (c) 2016 Magnus Norddahl -** -** This software is provided 'as-is', without any express or implied -** warranty. In no event will the authors be held liable for any damages -** arising from the use of this software. -** -** Permission is granted to anyone to use this software for any purpose, -** including commercial applications, and to alter it and redistribute it -** freely, subject to the following restrictions: -** -** 1. The origin of this software must not be misrepresented; you must not -** claim that you wrote the original software. If you use this software -** in a product, an acknowledgment in the product documentation would be -** appreciated but is not required. -** 2. Altered source versions must be plainly marked as such, and must not be -** misrepresented as being the original software. -** 3. This notice may not be removed or altered from any source distribution. -** -*/ - -#pragma once - -#include "screen_triangle.h" - -#ifdef _MSC_VER -#pragma warning(disable: 4752) // warning C4752 : found Intel(R) Advanced Vector Extensions; consider using /arch:AVX -#endif - -namespace TriScreenDrawerModes -{ - template - FORCEINLINE unsigned int VECTORCALL Sample32_AVX2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, uint32_t oneU, uint32_t oneV, uint32_t color, const uint32_t *translation) - { - uint32_t texel; - if (SamplerT::Mode == (int)Samplers::Shaded || SamplerT::Mode == (int)Samplers::Stencil || SamplerT::Mode == (int)Samplers::Fill || SamplerT::Mode == (int)Samplers::Fuzz) - { - return color; - } - else if (SamplerT::Mode == (int)Samplers::Translated) - { - const uint8_t *texpal = (const uint8_t *)texPixels; - uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; - uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; - return translation[texpal[texelX * texHeight + texelY]]; - } - else if (FilterModeT::Mode == (int)FilterModes::Nearest) - { - uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; - uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; - texel = texPixels[texelX * texHeight + texelY]; - } - else - { - u -= oneU >> 1; - v -= oneV >> 1; - - unsigned int frac_x0 = (((uint32_t)u << 8) >> FRACBITS) * texWidth; - unsigned int frac_x1 = ((((uint32_t)u << 8) + oneU) >> FRACBITS) * texWidth; - unsigned int frac_y0 = (((uint32_t)v << 8) >> FRACBITS) * texHeight; - unsigned int frac_y1 = ((((uint32_t)v << 8) + oneV) >> FRACBITS) * texHeight; - unsigned int x0 = frac_x0 >> FRACBITS; - unsigned int x1 = frac_x1 >> FRACBITS; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = texPixels[x0 * texHeight + y0]; - unsigned int p01 = texPixels[x0 * texHeight + y1]; - unsigned int p10 = texPixels[x1 * texHeight + y0]; - unsigned int p11 = texPixels[x1 * texHeight + y1]; - - unsigned int inv_a = (frac_x1 >> (FRACBITS - 4)) & 15; - unsigned int inv_b = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - texel = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - } - - if (SamplerT::Mode == (int)Samplers::Skycap) - { - int start_fade = 2; // How fast it should fade out - - int alpha_top = clamp(v >> (16 - start_fade), 0, 256); - int alpha_bottom = clamp(((2 << 24) - v) >> (16 - start_fade), 0, 256); - int a = MIN(alpha_top, alpha_bottom); - int inv_a = 256 - a; - - uint32_t r = RPART(texel); - uint32_t g = GPART(texel); - uint32_t b = BPART(texel); - uint32_t fg_a = APART(texel); - uint32_t bg_red = RPART(color); - uint32_t bg_green = GPART(color); - uint32_t bg_blue = BPART(color); - r = (r * a + bg_red * inv_a + 127) >> 8; - g = (g * a + bg_green * inv_a + 127) >> 8; - b = (b * a + bg_blue * inv_a + 127) >> 8; - return MAKEARGB(fg_a, r, g, b); - } - else - { - return texel; - } - } - - template - FORCEINLINE unsigned int VECTORCALL SampleShade32_AVX2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, int &fuzzpos) - { - if (SamplerT::Mode == (int)Samplers::Shaded) - { - const uint8_t *texpal = (const uint8_t *)texPixels; - uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; - uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; - unsigned int sampleshadeout = texpal[texelX * texHeight + texelY]; - sampleshadeout += sampleshadeout >> 7; // 255 -> 256 - return sampleshadeout; - } - else if (SamplerT::Mode == (int)Samplers::Stencil) - { - uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; - uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; - unsigned int sampleshadeout = APART(texPixels[texelX * texHeight + texelY]); - sampleshadeout += sampleshadeout >> 7; // 255 -> 256 - return sampleshadeout; - } - else if (SamplerT::Mode == (int)Samplers::Fuzz) - { - uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; - uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; - unsigned int sampleshadeout = APART(texPixels[texelX * texHeight + texelY]); - sampleshadeout += sampleshadeout >> 7; // 255 -> 256 - sampleshadeout = (sampleshadeout * fuzzcolormap[fuzzpos++]) >> 5; - if (fuzzpos >= FUZZTABLE) fuzzpos = 0; - return sampleshadeout; - } - else - { - return 0; - } - } - - template - FORCEINLINE __m256i VECTORCALL Shade32_AVX2(__m256i fgcolor, __m256i mlight, __m256i desaturate, __m256i inv_desaturate, __m256i shade_fade, __m256i shade_light) - { - if (ShadeModeT::Mode == (int)ShadeMode::Simple) - { - fgcolor = _mm256_srli_epi16(_mm256_mullo_epi16(fgcolor, mlight), 8); - } - else if (ShadeModeT::Mode == (int)ShadeMode::Advanced) - { - __m256i intensity = _mm256_mullo_epi16(fgcolor, _mm256_set_epi16(0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37)); - intensity = _mm256_add_epi16(intensity, _mm256_srli_epi64(intensity, 32)); - intensity = _mm256_add_epi16(intensity, _mm256_srli_epi64(intensity, 16)); - intensity = _mm256_srli_epi16(intensity, 8); - intensity = _mm256_mullo_epi16(intensity, desaturate); - intensity = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(intensity, _MM_SHUFFLE(3, 0, 0, 0)), _MM_SHUFFLE(3, 0, 0, 0)); - - fgcolor = _mm256_srli_epi16(_mm256_add_epi16(_mm256_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm256_mullo_epi16(fgcolor, mlight); - fgcolor = _mm256_srli_epi16(_mm256_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm256_srli_epi16(_mm256_mullo_epi16(fgcolor, shade_light), 8); - } - return fgcolor; - } - - template - FORCEINLINE __m256i VECTORCALL Blend32_AVX2(__m256i fgcolor, __m256i bgcolor, __m256i ifgcolor, __m256i ifgshade, __m256i srcalpha, __m256i destalpha) - { - if (BlendT::Mode == (int)BlendModes::Opaque) - { - __m256i outcolor = fgcolor; - outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256()); - return outcolor; - } - else if (BlendT::Mode == (int)BlendModes::Masked) - { - __m256i mask = _mm256_cmpeq_epi32(_mm256_packus_epi16(fgcolor, _mm256_setzero_si256()), _mm256_setzero_si256()); - mask = _mm256_unpacklo_epi8(mask, _mm256_setzero_si256()); - __m256i outcolor = _mm256_or_si256(_mm256_and_si256(mask, bgcolor), _mm256_andnot_si256(mask, fgcolor)); - outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256()); - outcolor = _mm256_or_si256(outcolor, _mm256_set1_epi32(0xff000000)); - return outcolor; - } - else if (BlendT::Mode == (int)BlendModes::AddSrcColorOneMinusSrcColor) - { - __m256i inv_srccolor = _mm256_sub_epi16(_mm256_set1_epi16(256), _mm256_add_epi16(fgcolor, _mm256_srli_epi16(fgcolor, 7))); - __m256i outcolor = _mm256_add_epi16(fgcolor, _mm256_srli_epi16(_mm256_mullo_epi16(bgcolor, inv_srccolor), 8)); - outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256()); - return outcolor; - } - else if (BlendT::Mode == (int)BlendModes::Shaded) - { - ifgshade = _mm256_srli_epi32(_mm256_add_epi32(_mm256_mul_epu32(ifgshade, srcalpha), _mm256_set1_epi32(128)), 8); - __m256i alpha = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(ifgshade, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0)); - __m256i inv_alpha = _mm256_sub_epi16(_mm256_set1_epi16(256), alpha); - - fgcolor = _mm256_mullo_epi16(fgcolor, alpha); - bgcolor = _mm256_mullo_epi16(bgcolor, inv_alpha); - __m256i outcolor = _mm256_srli_epi16(_mm256_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256()); - outcolor = _mm256_or_si256(outcolor, _mm256_set1_epi32(0xff000000)); - return outcolor; - } - else if (BlendT::Mode == (int)BlendModes::AddClampShaded) - { - ifgshade = _mm256_srli_epi32(_mm256_add_epi32(_mm256_mul_epu32(ifgshade, srcalpha), _mm256_set1_epi32(128)), 8); - __m256i alpha = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(ifgshade, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0)); - __m256i inv_alpha = _mm256_sub_epi16(_mm256_set1_epi16(256), alpha); - - fgcolor = _mm256_srli_epi16(_mm256_mullo_epi16(fgcolor, alpha), 8); - __m256i outcolor = _mm256_add_epi16(fgcolor, bgcolor); - outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256()); - outcolor = _mm256_or_si256(outcolor, _mm256_set1_epi32(0xff000000)); - return outcolor; - } - else - { - __m256i alpha = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(ifgcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); - alpha = _mm256_srli_epi16(_mm256_add_epi16(alpha, _mm256_srli_epi16(alpha, 7)), 1); // 255->128 - __m256i inv_alpha = _mm256_sub_epi16(_mm256_set1_epi16(128), alpha); - - __m256i bgalpha = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_mullo_epi16(destalpha, alpha), _mm256_slli_epi16(inv_alpha, 8)), _mm256_set1_epi32(64)), 7); - __m256i fgalpha = _mm256_srli_epi16(_mm256_add_epi16(_mm256_mullo_epi16(srcalpha, alpha), _mm256_set1_epi32(64)), 7); - - fgcolor = _mm256_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm256_mullo_epi16(bgcolor, bgalpha); - - __m256i fg_lo = _mm256_unpacklo_epi16(fgcolor, _mm256_setzero_si256()); - __m256i bg_lo = _mm256_unpacklo_epi16(bgcolor, _mm256_setzero_si256()); - __m256i fg_hi = _mm256_unpackhi_epi16(fgcolor, _mm256_setzero_si256()); - __m256i bg_hi = _mm256_unpackhi_epi16(bgcolor, _mm256_setzero_si256()); - - __m256i out_lo, out_hi; - if (BlendT::Mode == (int)BlendModes::AddClamp) - { - out_lo = _mm256_add_epi32(fg_lo, bg_lo); - out_hi = _mm256_add_epi32(fg_hi, bg_hi); - } - else if (BlendT::Mode == (int)BlendModes::SubClamp) - { - out_lo = _mm256_sub_epi32(fg_lo, bg_lo); - out_hi = _mm256_sub_epi32(fg_hi, bg_hi); - } - else if (BlendT::Mode == (int)BlendModes::RevSubClamp) - { - out_lo = _mm256_sub_epi32(bg_lo, fg_lo); - out_hi = _mm256_sub_epi32(bg_hi, fg_hi); - } - - out_lo = _mm256_srai_epi32(out_lo, 8); - out_hi = _mm256_srai_epi32(out_hi, 8); - __m256i outcolor = _mm256_packs_epi32(out_lo, out_hi); - outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256()); - outcolor = _mm256_or_si256(outcolor, _mm256_set1_epi32(0xff000000)); - return outcolor; - } - } -} - -template -class TriScreenDrawer32_AVX2 -{ -public: - static void Execute(int x, int y, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args) - { - using namespace TriScreenDrawerModes; - - bool is_simple_shade = args->uniforms->SimpleShade(); - - if (SamplerT::Mode == (int)Samplers::Texture) - { - bool is_nearest_filter = args->uniforms->NearestFilter(); - - if (is_simple_shade) - { - if (is_nearest_filter) - DrawBlock(x, y, mask0, mask1, args); - else - DrawBlock(x, y, mask0, mask1, args); - } - else - { - if (is_nearest_filter) - DrawBlock(x, y, mask0, mask1, args); - else - DrawBlock(x, y, mask0, mask1, args); - } - } - else if (SamplerT::Mode == (int)Samplers::Fuzz) - { - DrawBlock(x, y, mask0, mask1, args); - } - else // no linear filtering for translated, shaded, stencil, fill or skycap - { - if (is_simple_shade) - { - DrawBlock(x, y, mask0, mask1, args); - } - else - { - DrawBlock(x, y, mask0, mask1, args); - } - } - } - -private: - template - FORCEINLINE static void VECTORCALL DrawBlock(int destX, int destY, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args) - { - using namespace TriScreenDrawerModes; - - bool is_fixed_light = args->uniforms->FixedLight(); - __m128i lightmask = _mm_set1_epi32(is_fixed_light ? 0 : 0xffffffff); - __m256i srcalpha = _mm256_set1_epi16(args->uniforms->SrcAlpha()); - __m256i destalpha = _mm256_set1_epi16(args->uniforms->DestAlpha()); - - int fuzzpos = (ScreenTriangle::FuzzStart + destX * 123 + destY) % FUZZTABLE; - - // Light - uint32_t light = args->uniforms->Light(); - float shade = MIN(2.0f - (light + 12.0f) / 128.0f, 31.0f / 32.0f); - float globVis = args->uniforms->GlobVis() * (1.0f / 32.0f); - light += (light >> 7); // 255 -> 256 - light <<= 8; - __m128i fixedlight = _mm_set1_epi32(light); - - // Calculate gradients - const TriVertex &v1 = *args->v1; - __m128 gradientX = _mm_setr_ps(args->gradientX.W, args->gradientX.U, args->gradientX.V, 0.0f); - __m128 gradientY = _mm_setr_ps(args->gradientY.W, args->gradientY.U, args->gradientY.V, 0.0f); - __m128 blockPosY = _mm_add_ps(_mm_add_ps( - _mm_setr_ps(v1.w, v1.u * v1.w, v1.v * v1.w, globVis), - _mm_mul_ps(gradientX, _mm_set1_ps(destX - v1.x))), - _mm_mul_ps(gradientY, _mm_set1_ps(destY - v1.y))); - gradientX = _mm_mul_ps(gradientX, _mm_set1_ps(8.0f)); - - // Output - uint32_t * RESTRICT destOrg = (uint32_t*)args->dest; - int pitch = args->pitch; - uint32_t *dest = destOrg + destX + destY * pitch; - int offset_next_line = pitch - 8; - - // Sampling stuff - uint32_t color = args->uniforms->Color(); - const uint32_t * RESTRICT translation = (const uint32_t *)args->uniforms->Translation(); - const uint32_t * RESTRICT texPixels = (const uint32_t *)args->uniforms->TexturePixels(); - uint32_t texWidth = args->uniforms->TextureWidth(); - uint32_t texHeight = args->uniforms->TextureHeight(); - uint32_t oneU, oneV; - if (SamplerT::Mode != (int)Samplers::Fill) - { - oneU = ((0x800000 + texWidth - 1) / texWidth) * 2 + 1; - oneV = ((0x800000 + texHeight - 1) / texHeight) * 2 + 1; - } - else - { - oneU = 0; - oneV = 0; - } - - // Shade constants - __m256i inv_desaturate, shade_fade, shade_light; - __m256i desaturate; - if (ShadeModeT::Mode == (int)ShadeMode::Advanced) - { - inv_desaturate = _mm256_setr_epi16( - 256, 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), - 256, 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), - 256, 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), - 256, 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate()); - shade_fade = _mm256_set_epi16( - args->uniforms->ShadeFadeAlpha(), args->uniforms->ShadeFadeRed(), args->uniforms->ShadeFadeGreen(), args->uniforms->ShadeFadeBlue(), - args->uniforms->ShadeFadeAlpha(), args->uniforms->ShadeFadeRed(), args->uniforms->ShadeFadeGreen(), args->uniforms->ShadeFadeBlue(), - args->uniforms->ShadeFadeAlpha(), args->uniforms->ShadeFadeRed(), args->uniforms->ShadeFadeGreen(), args->uniforms->ShadeFadeBlue(), - args->uniforms->ShadeFadeAlpha(), args->uniforms->ShadeFadeRed(), args->uniforms->ShadeFadeGreen(), args->uniforms->ShadeFadeBlue()); - shade_light = _mm256_set_epi16( - args->uniforms->ShadeLightAlpha(), args->uniforms->ShadeLightRed(), args->uniforms->ShadeLightGreen(), args->uniforms->ShadeLightBlue(), - args->uniforms->ShadeLightAlpha(), args->uniforms->ShadeLightRed(), args->uniforms->ShadeLightGreen(), args->uniforms->ShadeLightBlue(), - args->uniforms->ShadeLightAlpha(), args->uniforms->ShadeLightRed(), args->uniforms->ShadeLightGreen(), args->uniforms->ShadeLightBlue(), - args->uniforms->ShadeLightAlpha(), args->uniforms->ShadeLightRed(), args->uniforms->ShadeLightGreen(), args->uniforms->ShadeLightBlue()); - desaturate = _mm256_sub_epi16(_mm256_set1_epi16(256), inv_desaturate); - } - else - { - inv_desaturate = _mm256_setzero_si256(); - shade_fade = _mm256_setzero_si256(); - shade_fade = _mm256_setzero_si256(); - shade_light = _mm256_setzero_si256(); - desaturate = _mm256_setzero_si256(); - } - - if (mask0 == 0xffffffff && mask1 == 0xffffffff) - { - for (int y = 0; y < 8; y++) - { - __m128 blockPosX = _mm_add_ps(blockPosY, gradientX); - __m128 W = _mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(0, 0, 0, 0)); - __m128 rcpW = _mm_div_ps(_mm_set1_ps((float)0x01000000), W); - __m128i posUV = _mm_cvtps_epi32(_mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(2, 1, 2, 1)), rcpW)); - - __m128 vis = _mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(3, 3, 3, 3)), W); - __m128i lightpospair = _mm_sub_epi32( - _mm_set1_epi32(FRACUNIT), - _mm_cvtps_epi32(_mm_mul_ps( - _mm_max_ps(_mm_sub_ps(_mm_set1_ps(shade), _mm_min_ps(_mm_set1_ps(24.0f / 32.0f), vis)), _mm_setzero_ps()), - _mm_set1_ps((float)FRACUNIT)))); - lightpospair = _mm_or_si128(_mm_and_si128(lightmask, lightpospair), _mm_andnot_si128(lightmask, fixedlight)); - - int32_t posU = _mm_cvtsi128_si32(posUV); - int32_t posV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 4)); - int32_t nextU = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 8)); - int32_t nextV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 12)); - int32_t lightpos = _mm_cvtsi128_si32(lightpospair); - int32_t lightnext = _mm_cvtsi128_si32(_mm_srli_si128(lightpospair, 8)); - int32_t stepU = (nextU - posU) >> 3; - int32_t stepV = (nextV - posV) >> 3; - fixed_t lightstep = (lightnext - lightpos) >> 3; - - for (int ix = 0; ix < 2; ix++) - { - // Load bgcolor - __m256i bgcolor; - if (BlendT::Mode != (int)BlendModes::Opaque) - { - __m128i bgpacked = _mm_loadu_si128((__m128i*)dest); - bgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(bgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(bgpacked, _mm_setzero_si128())); - } - else - bgcolor = _mm256_setzero_si256(); - - // Sample fgcolor - unsigned int ifgcolor0 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - unsigned int ifgshade0 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); - posU += stepU; - posV += stepV; - - unsigned int ifgcolor1 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - unsigned int ifgshade1 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); - posU += stepU; - posV += stepV; - - unsigned int ifgcolor2 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - unsigned int ifgshade2 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); - posU += stepU; - posV += stepV; - - unsigned int ifgcolor3 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - unsigned int ifgshade3 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); - posU += stepU; - posV += stepV; - - // Setup light - int lightpos0 = lightpos >> 8; - lightpos += lightstep; - int lightpos1 = lightpos >> 8; - lightpos += lightstep; - int lightpos2 = lightpos >> 8; - lightpos += lightstep; - int lightpos3 = lightpos >> 8; - lightpos += lightstep; - __m256i mlight = _mm256_set_epi16( - 256, lightpos3, lightpos3, lightpos3, - 256, lightpos2, lightpos2, lightpos2, - 256, lightpos1, lightpos1, lightpos1, - 256, lightpos0, lightpos0, lightpos0); - - __m256i shade_fade_lit; - if (ShadeModeT::Mode == (int)ShadeMode::Advanced) - { - __m256i inv_light = _mm256_sub_epi16(_mm256_set_epi16(0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256), mlight); - shade_fade_lit = _mm256_mullo_epi16(shade_fade, inv_light); - } - else - { - shade_fade_lit = _mm256_setzero_si256(); - } - - // Shade and blend - __m128i fgpacked = _mm_set_epi32(ifgcolor3, ifgcolor2, ifgcolor1, ifgcolor0); - __m128i shadepacked = _mm_set_epi32(ifgshade3, ifgshade2, ifgshade1, ifgshade0); - __m256i mifgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(fgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(fgpacked, _mm_setzero_si128())); - __m256i mifgshade = _mm256_set_m128i(_mm_unpackhi_epi32(shadepacked, shadepacked), _mm_unpacklo_epi32(shadepacked, shadepacked)); - __m256i fgcolor = mifgcolor; - fgcolor = Shade32_AVX2(fgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m256i outcolor = Blend32_AVX2(fgcolor, bgcolor, mifgcolor, mifgshade, srcalpha, destalpha); - - // Store result - _mm_storeu_si128((__m128i*)dest, _mm_or_si128(_mm256_extracti128_si256(outcolor, 0), _mm_slli_si128(_mm256_extracti128_si256(outcolor, 1), 8))); - dest += 4; - } - - blockPosY = _mm_add_ps(blockPosY, gradientY); - - dest += offset_next_line; - } - } - else - { - // mask0 loop: - for (int y = 0; y < 4; y++) - { - __m128 blockPosX = _mm_add_ps(blockPosY, gradientX); - __m128 W = _mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(0, 0, 0, 0)); - __m128 rcpW = _mm_div_ps(_mm_set1_ps((float)0x01000000), W); - __m128i posUV = _mm_cvtps_epi32(_mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(2, 1, 2, 1)), rcpW)); - - __m128 vis = _mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(3, 3, 3, 3)), W); - __m128i lightpospair = _mm_sub_epi32( - _mm_set1_epi32(FRACUNIT), - _mm_cvtps_epi32(_mm_mul_ps( - _mm_max_ps(_mm_sub_ps(_mm_set1_ps(shade), _mm_min_ps(_mm_set1_ps(24.0f / 32.0f), vis)), _mm_setzero_ps()), - _mm_set1_ps((float)FRACUNIT)))); - lightpospair = _mm_or_si128(_mm_and_si128(lightmask, lightpospair), _mm_andnot_si128(lightmask, fixedlight)); - - int32_t posU = _mm_cvtsi128_si32(posUV); - int32_t posV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 4)); - int32_t nextU = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 8)); - int32_t nextV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 12)); - int32_t lightpos = _mm_cvtsi128_si32(lightpospair); - int32_t lightnext = _mm_cvtsi128_si32(_mm_srli_si128(lightpospair, 8)); - int32_t stepU = (nextU - posU) >> 3; - int32_t stepV = (nextV - posV) >> 3; - fixed_t lightstep = (lightnext - lightpos) >> 3; - - for (int x = 0; x < 2; x++) - { - // Load bgcolor - uint32_t desttmp[4]; - __m256i bgcolor; - if (BlendT::Mode != (int)BlendModes::Opaque) - { - if (mask0 & (1 << 31)) desttmp[0] = dest[0]; - if (mask0 & (1 << 30)) desttmp[1] = dest[1]; - if (mask0 & (1 << 29)) desttmp[2] = dest[2]; - if (mask0 & (1 << 28)) desttmp[3] = dest[3]; - - __m128i bgpacked = _mm_loadu_si128((__m128i*)(desttmp)); - bgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(bgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(bgpacked, _mm_setzero_si128())); - } - else - bgcolor = _mm256_setzero_si256(); - - // Sample fgcolor - unsigned int ifgcolor0 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - unsigned int ifgshade0 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); - posU += stepU; - posV += stepV; - - unsigned int ifgcolor1 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - unsigned int ifgshade1 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); - posU += stepU; - posV += stepV; - - unsigned int ifgcolor2 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - unsigned int ifgshade2 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); - posU += stepU; - posV += stepV; - - unsigned int ifgcolor3 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - unsigned int ifgshade3 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); - posU += stepU; - posV += stepV; - - // Setup light - int lightpos0 = lightpos >> 8; - lightpos += lightstep; - int lightpos1 = lightpos >> 8; - lightpos += lightstep; - int lightpos2 = lightpos >> 8; - lightpos += lightstep; - int lightpos3 = lightpos >> 8; - lightpos += lightstep; - __m256i mlight = _mm256_set_epi16( - 256, lightpos3, lightpos3, lightpos3, - 256, lightpos2, lightpos2, lightpos2, - 256, lightpos1, lightpos1, lightpos1, - 256, lightpos0, lightpos0, lightpos0); - - __m256i shade_fade_lit; - if (ShadeModeT::Mode == (int)ShadeMode::Advanced) - { - __m256i inv_light = _mm256_sub_epi16(_mm256_set_epi16(0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256), mlight); - shade_fade_lit = _mm256_mullo_epi16(shade_fade, inv_light); - } - else - { - shade_fade_lit = _mm256_setzero_si256(); - } - - // Shade and blend - __m128i fgpacked = _mm_set_epi32(ifgcolor3, ifgcolor2, ifgcolor1, ifgcolor0); - __m128i shadepacked = _mm_set_epi32(ifgshade3, ifgshade2, ifgshade1, ifgshade0); - __m256i mifgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(fgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(fgpacked, _mm_setzero_si128())); - __m256i mifgshade = _mm256_set_m128i(_mm_unpackhi_epi32(shadepacked, shadepacked), _mm_unpacklo_epi32(shadepacked, shadepacked)); - __m256i fgcolor = mifgcolor; - fgcolor = Shade32_AVX2(fgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m256i outcolor = Blend32_AVX2(fgcolor, bgcolor, mifgcolor, mifgshade, srcalpha, destalpha); - - // Store result - _mm_storeu_si128((__m128i*)desttmp, _mm_or_si128(_mm256_extracti128_si256(outcolor, 0), _mm_slli_si128(_mm256_extracti128_si256(outcolor, 1), 8))); - if (mask0 & (1 << 31)) dest[0] = desttmp[0]; - if (mask0 & (1 << 30)) dest[1] = desttmp[1]; - if (mask0 & (1 << 29)) dest[2] = desttmp[2]; - if (mask0 & (1 << 28)) dest[3] = desttmp[3]; - - mask0 <<= 4; - dest += 4; - } - - blockPosY = _mm_add_ps(blockPosY, gradientY); - - dest += offset_next_line; - } - - // mask1 loop: - for (int y = 0; y < 4; y++) - { - __m128 blockPosX = _mm_add_ps(blockPosY, gradientX); - __m128 W = _mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(0, 0, 0, 0)); - __m128 rcpW = _mm_div_ps(_mm_set1_ps((float)0x01000000), W); - __m128i posUV = _mm_cvtps_epi32(_mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(2, 1, 2, 1)), rcpW)); - - __m128 vis = _mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(3, 3, 3, 3)), W); - __m128i lightpospair = _mm_sub_epi32( - _mm_set1_epi32(FRACUNIT), - _mm_cvtps_epi32(_mm_mul_ps( - _mm_max_ps(_mm_sub_ps(_mm_set1_ps(shade), _mm_min_ps(_mm_set1_ps(24.0f / 32.0f), vis)), _mm_setzero_ps()), - _mm_set1_ps((float)FRACUNIT)))); - lightpospair = _mm_or_si128(_mm_and_si128(lightmask, lightpospair), _mm_andnot_si128(lightmask, fixedlight)); - - int32_t posU = _mm_cvtsi128_si32(posUV); - int32_t posV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 4)); - int32_t nextU = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 8)); - int32_t nextV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 12)); - int32_t lightpos = _mm_cvtsi128_si32(lightpospair); - int32_t lightnext = _mm_cvtsi128_si32(_mm_srli_si128(lightpospair, 8)); - int32_t stepU = (nextU - posU) >> 3; - int32_t stepV = (nextV - posV) >> 3; - fixed_t lightstep = (lightnext - lightpos) >> 3; - - for (int x = 0; x < 2; x++) - { - // Load bgcolor - uint32_t desttmp[4]; - __m256i bgcolor; - if (BlendT::Mode != (int)BlendModes::Opaque) - { - if (mask1 & (1 << 31)) desttmp[0] = dest[0]; - if (mask1 & (1 << 30)) desttmp[1] = dest[1]; - if (mask1 & (1 << 29)) desttmp[2] = dest[2]; - if (mask1 & (1 << 28)) desttmp[3] = dest[3]; - - __m128i bgpacked = _mm_loadu_si128((__m128i*)(desttmp)); - bgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(bgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(bgpacked, _mm_setzero_si128())); - } - else - bgcolor = _mm256_setzero_si256(); - - // Sample fgcolor - unsigned int ifgcolor0 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - unsigned int ifgshade0 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); - posU += stepU; - posV += stepV; - - unsigned int ifgcolor1 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - unsigned int ifgshade1 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); - posU += stepU; - posV += stepV; - - unsigned int ifgcolor2 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - unsigned int ifgshade2 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); - posU += stepU; - posV += stepV; - - unsigned int ifgcolor3 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - unsigned int ifgshade3 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); - posU += stepU; - posV += stepV; - - // Setup light - int lightpos0 = lightpos >> 8; - lightpos += lightstep; - int lightpos1 = lightpos >> 8; - lightpos += lightstep; - int lightpos2 = lightpos >> 8; - lightpos += lightstep; - int lightpos3 = lightpos >> 8; - lightpos += lightstep; - __m256i mlight = _mm256_set_epi16( - 256, lightpos3, lightpos3, lightpos3, - 256, lightpos2, lightpos2, lightpos2, - 256, lightpos1, lightpos1, lightpos1, - 256, lightpos0, lightpos0, lightpos0); - - __m256i shade_fade_lit; - if (ShadeModeT::Mode == (int)ShadeMode::Advanced) - { - __m256i inv_light = _mm256_sub_epi16(_mm256_set_epi16(0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256), mlight); - shade_fade_lit = _mm256_mullo_epi16(shade_fade, inv_light); - } - else - { - shade_fade_lit = _mm256_setzero_si256(); - } - - // Shade and blend - __m128i fgpacked = _mm_set_epi32(ifgcolor3, ifgcolor2, ifgcolor1, ifgcolor0); - __m128i shadepacked = _mm_set_epi32(ifgshade3, ifgshade2, ifgshade1, ifgshade0); - __m256i mifgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(fgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(fgpacked, _mm_setzero_si128())); - __m256i mifgshade = _mm256_set_m128i(_mm_unpackhi_epi32(shadepacked, shadepacked), _mm_unpacklo_epi32(shadepacked, shadepacked)); - __m256i fgcolor = mifgcolor; - fgcolor = Shade32_AVX2(fgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m256i outcolor = Blend32_AVX2(fgcolor, bgcolor, mifgcolor, mifgshade, srcalpha, destalpha); - - // Store result - _mm_storeu_si128((__m128i*)desttmp, _mm_or_si128(_mm256_extracti128_si256(outcolor, 0), _mm_slli_si128(_mm256_extracti128_si256(outcolor, 1), 8))); - if (mask1 & (1 << 31)) dest[0] = desttmp[0]; - if (mask1 & (1 << 30)) dest[1] = desttmp[1]; - if (mask1 & (1 << 29)) dest[2] = desttmp[2]; - if (mask1 & (1 << 28)) dest[3] = desttmp[3]; - - mask1 <<= 4; - dest += 4; - } - - blockPosY = _mm_add_ps(blockPosY, gradientY); - - dest += offset_next_line; - } - } - } -}; diff --git a/src/polyrenderer/drawers/poly_drawer32_sse2.h b/src/polyrenderer/drawers/poly_drawer32_sse2.h index 2f690f7e8..5125c93c7 100644 --- a/src/polyrenderer/drawers/poly_drawer32_sse2.h +++ b/src/polyrenderer/drawers/poly_drawer32_sse2.h @@ -27,7 +27,7 @@ namespace TriScreenDrawerModes { template - FORCEINLINE unsigned int VECTORCALL Sample32_SSE2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, uint32_t oneU, uint32_t oneV, uint32_t color, const uint32_t *translation) + FORCEINLINE unsigned int VECTORCALL Sample32(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, uint32_t oneU, uint32_t oneV, uint32_t color, const uint32_t *translation) { uint32_t texel; if (SamplerT::Mode == (int)Samplers::Shaded || SamplerT::Mode == (int)Samplers::Stencil || SamplerT::Mode == (int)Samplers::Fill || SamplerT::Mode == (int)Samplers::Fuzz) @@ -107,7 +107,7 @@ namespace TriScreenDrawerModes } template - FORCEINLINE unsigned int VECTORCALL SampleShade32_SSE2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, int &fuzzpos) + FORCEINLINE unsigned int VECTORCALL SampleShade32(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, int &fuzzpos) { if (SamplerT::Mode == (int)Samplers::Shaded) { @@ -143,7 +143,7 @@ namespace TriScreenDrawerModes } template - FORCEINLINE __m128i VECTORCALL Shade32_SSE2(__m128i fgcolor, __m128i mlight, unsigned int ifgcolor0, unsigned int ifgcolor1, int desaturate, __m128i inv_desaturate, __m128i shade_fade, __m128i shade_light) + FORCEINLINE __m128i VECTORCALL Shade32(__m128i fgcolor, __m128i mlight, unsigned int ifgcolor0, unsigned int ifgcolor1, int desaturate, __m128i inv_desaturate, __m128i shade_fade, __m128i shade_light) { if (ShadeModeT::Mode == (int)ShadeMode::Simple) { @@ -172,7 +172,7 @@ namespace TriScreenDrawerModes } template - FORCEINLINE __m128i VECTORCALL Blend32_SSE2(__m128i fgcolor, __m128i bgcolor, unsigned int ifgcolor0, unsigned int ifgcolor1, unsigned int ifgshade0, unsigned int ifgshade1, uint32_t srcalpha, uint32_t destalpha) + FORCEINLINE __m128i VECTORCALL Blend32(__m128i fgcolor, __m128i bgcolor, unsigned int ifgcolor0, unsigned int ifgcolor1, unsigned int ifgshade0, unsigned int ifgshade1, uint32_t srcalpha, uint32_t destalpha) { if (BlendT::Mode == (int)BlendModes::Opaque) { @@ -275,7 +275,7 @@ namespace TriScreenDrawerModes } template -class TriScreenDrawer32_SSE2 +class TriScreenDrawer32 { public: static void Execute(int x, int y, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args) @@ -430,13 +430,13 @@ private: // Sample fgcolor unsigned int ifgcolor[2], ifgshade[2]; - ifgcolor[0] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[0] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; - ifgcolor[1] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[1] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[1] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[1] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; @@ -460,8 +460,8 @@ private: // Shade and blend __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - fgcolor = Shade32_SSE2(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m128i outcolor = Blend32_SSE2(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); + fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); // Store result _mm_storel_epi64((__m128i*)(dest + ix * 2), outcolor); @@ -517,13 +517,13 @@ private: // Sample fgcolor unsigned int ifgcolor[2], ifgshade[2]; - ifgcolor[0] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[0] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; - ifgcolor[1] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[1] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[1] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[1] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; @@ -547,8 +547,8 @@ private: // Shade and blend __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - fgcolor = Shade32_SSE2(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m128i outcolor = Blend32_SSE2(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); + fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); // Store result _mm_storel_epi64((__m128i*)desttmp, outcolor); @@ -606,13 +606,13 @@ private: // Sample fgcolor unsigned int ifgcolor[2], ifgshade[2]; - ifgcolor[0] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[0] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; - ifgcolor[1] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[1] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[1] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[1] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; @@ -636,8 +636,8 @@ private: // Shade and blend __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - fgcolor = Shade32_SSE2(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m128i outcolor = Blend32_SSE2(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); + fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); // Store result _mm_storel_epi64((__m128i*)desttmp, outcolor); @@ -658,7 +658,7 @@ private: }; template -class RectScreenDrawer32_SSE2 +class RectScreenDrawer32 { public: static void Execute(const void *destOrg, int destWidth, int destHeight, int destPitch, const RectDrawArgs *args, WorkerThreadData *thread) @@ -780,18 +780,18 @@ private: // Sample fgcolor unsigned int ifgcolor[2], ifgshade[2]; - ifgcolor[0] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[0] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; - ifgcolor[1] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[1] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[1] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[1] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; // Shade and blend __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - fgcolor = Shade32_SSE2(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m128i outcolor = Blend32_SSE2(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); + fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); // Store result _mm_storel_epi64((__m128i*)dest, outcolor); @@ -809,16 +809,16 @@ private: // Sample fgcolor unsigned int ifgcolor[2], ifgshade[2]; - ifgcolor[0] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[0] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); ifgcolor[1] = 0; ifgshade[1] = 0; posU += stepU; // Shade and blend __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - fgcolor = Shade32_SSE2(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m128i outcolor = Blend32_SSE2(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); + fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); // Store result *dest = _mm_cvtsi128_si32(outcolor); diff --git a/src/polyrenderer/drawers/poly_triangle.cpp b/src/polyrenderer/drawers/poly_triangle.cpp index e4d91a65e..84dd13897 100644 --- a/src/polyrenderer/drawers/poly_triangle.cpp +++ b/src/polyrenderer/drawers/poly_triangle.cpp @@ -152,8 +152,14 @@ ShadedTriVertex PolyTriangleDrawer::shade_vertex(const TriMatrix &objectToClip, return sv; } -void PolyTriangleDrawer::clip_to_viewport(TriVertex *clippedvert, int numclipvert) +void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread) { + // Cull, clip and generate additional vertices as needed + TriVertex clippedvert[max_additional_vertices]; + int numclipvert = clipedge(vert, clippedvert); + +#ifdef NO_SSE + // Map to 2D viewport: for (int j = 0; j < numclipvert; j++) { auto &v = clippedvert[j]; @@ -168,11 +174,8 @@ void PolyTriangleDrawer::clip_to_viewport(TriVertex *clippedvert, int numclipver v.x = viewport_x + viewport_width * (1.0f + v.x) * 0.5f; v.y = viewport_y + viewport_height * (1.0f - v.y) * 0.5f; } -} - -#ifndef NO_SSE -void PolyTriangleDrawer::clip_to_viewport_sse2(TriVertex *clippedvert, int numclipvert) -{ +#else + // Map to 2D viewport: __m128 mviewport_x = _mm_set1_ps((float)viewport_x); __m128 mviewport_y = _mm_set1_ps((float)viewport_y); __m128 mviewport_halfwidth = _mm_set1_ps(viewport_width * 0.5f); @@ -203,21 +206,8 @@ void PolyTriangleDrawer::clip_to_viewport_sse2(TriVertex *clippedvert, int numcl _mm_storeu_ps(&clippedvert[j + 2].x, vz); _mm_storeu_ps(&clippedvert[j + 3].x, vw); } -} #endif -void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread) -{ - // Cull, clip and generate additional vertices as needed - TriVertex clippedvert[max_additional_vertices]; - int numclipvert = CPU.bSSE2 ? clipedge_sse2(vert, clippedvert) : clipedge(vert, clippedvert); - - // Map to 2D viewport: - if (CPU.bSSE2) - clip_to_viewport_sse2(clippedvert, numclipvert); - else - clip_to_viewport(clippedvert, numclipvert); - // Keep varyings in -128 to 128 range if possible if (numclipvert > 0) { @@ -266,6 +256,7 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe // halfspace clip distances static const int numclipdistances = 7; +#ifdef NO_SSE float clipdistance[numclipdistances * 3]; bool needsclipping = false; float *clipd = clipdistance; @@ -292,6 +283,43 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe } return 3; } +#else + __m128 mx = _mm_loadu_ps(&verts[0].x); + __m128 my = _mm_loadu_ps(&verts[1].x); + __m128 mz = _mm_loadu_ps(&verts[2].x); + __m128 mw = _mm_setzero_ps(); + _MM_TRANSPOSE4_PS(mx, my, mz, mw); + __m128 clipd0 = _mm_add_ps(mx, mw); + __m128 clipd1 = _mm_sub_ps(mw, mx); + __m128 clipd2 = _mm_add_ps(my, mw); + __m128 clipd3 = _mm_sub_ps(mw, my); + __m128 clipd4 = _mm_add_ps(mz, mw); + __m128 clipd5 = _mm_sub_ps(mw, mz); + __m128 clipd6 = _mm_setr_ps(verts[0].clipDistance0, verts[1].clipDistance0, verts[2].clipDistance0, 0.0f); + __m128 mneedsclipping = _mm_cmplt_ps(clipd0, _mm_setzero_ps()); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd1, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd2, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd3, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd4, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd5, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd6, _mm_setzero_ps())); + if (_mm_movemask_ps(mneedsclipping) == 0) + { + for (int i = 0; i < 3; i++) + { + memcpy(clippedvert + i, verts + i, sizeof(TriVertex)); + } + return 3; + } + float clipdistance[numclipdistances * 4]; + _mm_storeu_ps(clipdistance, clipd0); + _mm_storeu_ps(clipdistance + 4, clipd1); + _mm_storeu_ps(clipdistance + 8, clipd2); + _mm_storeu_ps(clipdistance + 12, clipd3); + _mm_storeu_ps(clipdistance + 16, clipd4); + _mm_storeu_ps(clipdistance + 20, clipd5); + _mm_storeu_ps(clipdistance + 24, clipd6); +#endif // use barycentric weights while clipping vertices float weights[max_additional_vertices * 3 * 2]; @@ -314,6 +342,7 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe for (int i = 0; i < inputverts; i++) { int j = (i + 1) % inputverts; +#ifdef NO_SSE float clipdistance1 = clipdistance[0 * numclipdistances + p] * input[i * 3 + 0] + clipdistance[1 * numclipdistances + p] * input[i * 3 + 1] + @@ -323,6 +352,17 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe clipdistance[0 * numclipdistances + p] * input[j * 3 + 0] + clipdistance[1 * numclipdistances + p] * input[j * 3 + 1] + clipdistance[2 * numclipdistances + p] * input[j * 3 + 2]; +#else + float clipdistance1 = + clipdistance[0 + p * 4] * input[i * 3 + 0] + + clipdistance[1 + p * 4] * input[i * 3 + 1] + + clipdistance[2 + p * 4] * input[i * 3 + 2]; + + float clipdistance2 = + clipdistance[0 + p * 4] * input[j * 3 + 0] + + clipdistance[1 + p * 4] * input[j * 3 + 1] + + clipdistance[2 + p * 4] * input[j * 3 + 2]; +#endif // Clip halfspace if ((clipdistance1 >= 0.0f || clipdistance2 >= 0.0f) && outputverts + 1 < max_additional_vertices) @@ -369,129 +409,6 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe return inputverts; } -#ifndef NO_SSE -int PolyTriangleDrawer::clipedge_sse2(const ShadedTriVertex *verts, TriVertex *clippedvert) -{ - // Clip and cull so that the following is true for all vertices: - // -v.w <= v.x <= v.w - // -v.w <= v.y <= v.w - // -v.w <= v.z <= v.w - - // halfspace clip distances - static const int numclipdistances = 7; - __m128 mx = _mm_loadu_ps(&verts[0].x); - __m128 my = _mm_loadu_ps(&verts[1].x); - __m128 mz = _mm_loadu_ps(&verts[2].x); - __m128 mw = _mm_setzero_ps(); - _MM_TRANSPOSE4_PS(mx, my, mz, mw); - __m128 clipd0 = _mm_add_ps(mx, mw); - __m128 clipd1 = _mm_sub_ps(mw, mx); - __m128 clipd2 = _mm_add_ps(my, mw); - __m128 clipd3 = _mm_sub_ps(mw, my); - __m128 clipd4 = _mm_add_ps(mz, mw); - __m128 clipd5 = _mm_sub_ps(mw, mz); - __m128 clipd6 = _mm_setr_ps(verts[0].clipDistance0, verts[1].clipDistance0, verts[2].clipDistance0, 0.0f); - __m128 mneedsclipping = _mm_cmplt_ps(clipd0, _mm_setzero_ps()); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd1, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd2, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd3, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd4, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd5, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd6, _mm_setzero_ps())); - if (_mm_movemask_ps(mneedsclipping) == 0) - { - for (int i = 0; i < 3; i++) - { - memcpy(clippedvert + i, verts + i, sizeof(TriVertex)); - } - return 3; - } - float clipdistance[numclipdistances * 4]; - _mm_storeu_ps(clipdistance, clipd0); - _mm_storeu_ps(clipdistance + 4, clipd1); - _mm_storeu_ps(clipdistance + 8, clipd2); - _mm_storeu_ps(clipdistance + 12, clipd3); - _mm_storeu_ps(clipdistance + 16, clipd4); - _mm_storeu_ps(clipdistance + 20, clipd5); - _mm_storeu_ps(clipdistance + 24, clipd6); - - // use barycentric weights while clipping vertices - float weights[max_additional_vertices * 3 * 2]; - for (int i = 0; i < 3; i++) - { - weights[i * 3 + 0] = 0.0f; - weights[i * 3 + 1] = 0.0f; - weights[i * 3 + 2] = 0.0f; - weights[i * 3 + i] = 1.0f; - } - - // Clip against each halfspace - float *input = weights; - float *output = weights + max_additional_vertices * 3; - int inputverts = 3; - for (int p = 0; p < numclipdistances; p++) - { - // Clip each edge - int outputverts = 0; - for (int i = 0; i < inputverts; i++) - { - int j = (i + 1) % inputverts; - float clipdistance1 = - clipdistance[0 + p * 4] * input[i * 3 + 0] + - clipdistance[1 + p * 4] * input[i * 3 + 1] + - clipdistance[2 + p * 4] * input[i * 3 + 2]; - - float clipdistance2 = - clipdistance[0 + p * 4] * input[j * 3 + 0] + - clipdistance[1 + p * 4] * input[j * 3 + 1] + - clipdistance[2 + p * 4] * input[j * 3 + 2]; - - // Clip halfspace - if ((clipdistance1 >= 0.0f || clipdistance2 >= 0.0f) && outputverts + 1 < max_additional_vertices) - { - float t1 = (clipdistance1 < 0.0f) ? MAX(-clipdistance1 / (clipdistance2 - clipdistance1), 0.0f) : 0.0f; - float t2 = (clipdistance2 < 0.0f) ? MIN(1.0f + clipdistance2 / (clipdistance1 - clipdistance2), 1.0f) : 1.0f; - - // add t1 vertex - for (int k = 0; k < 3; k++) - output[outputverts * 3 + k] = input[i * 3 + k] * (1.0f - t1) + input[j * 3 + k] * t1; - outputverts++; - - if (t2 != 1.0f && t2 > t1) - { - // add t2 vertex - for (int k = 0; k < 3; k++) - output[outputverts * 3 + k] = input[i * 3 + k] * (1.0f - t2) + input[j * 3 + k] * t2; - outputverts++; - } - } - } - std::swap(input, output); - inputverts = outputverts; - if (inputverts == 0) - break; - } - - // Convert barycentric weights to actual vertices - for (int i = 0; i < inputverts; i++) - { - auto &v = clippedvert[i]; - memset(&v, 0, sizeof(TriVertex)); - for (int w = 0; w < 3; w++) - { - float weight = input[i * 3 + w]; - v.x += verts[w].x * weight; - v.y += verts[w].y * weight; - v.z += verts[w].z * weight; - v.w += verts[w].w * weight; - v.u += verts[w].u * weight; - v.v += verts[w].v * weight; - } - } - return inputverts; -} -#endif - ///////////////////////////////////////////////////////////////////////////// DrawPolyTrianglesCommand::DrawPolyTrianglesCommand(const PolyDrawArgs &args, bool mirror) diff --git a/src/polyrenderer/drawers/poly_triangle.h b/src/polyrenderer/drawers/poly_triangle.h index ca92ac735..c939149d3 100644 --- a/src/polyrenderer/drawers/poly_triangle.h +++ b/src/polyrenderer/drawers/poly_triangle.h @@ -47,12 +47,8 @@ private: static ShadedTriVertex shade_vertex(const TriMatrix &objectToClip, const float *clipPlane, const TriVertex &v); static void draw_arrays(const PolyDrawArgs &args, WorkerThreadData *thread); static void draw_shaded_triangle(const ShadedTriVertex *vertices, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread); - static void clip_to_viewport(TriVertex *clippedvert, int numclipvert); + static int clipedge(const ShadedTriVertex *verts, TriVertex *clippedvert); -#ifndef NO_SSE - static void clip_to_viewport_sse2(TriVertex *clippedvert, int numclipvert); - static int clipedge_sse2(const ShadedTriVertex *verts, TriVertex *clippedvert); -#endif static int viewport_x, viewport_y, viewport_width, viewport_height, dest_pitch, dest_width, dest_height; static bool dest_bgra; diff --git a/src/polyrenderer/drawers/screen_triangle.cpp b/src/polyrenderer/drawers/screen_triangle.cpp index bbfb0361f..01dd16766 100644 --- a/src/polyrenderer/drawers/screen_triangle.cpp +++ b/src/polyrenderer/drawers/screen_triangle.cpp @@ -36,20 +36,14 @@ #include "poly_triangle.h" #include "swrenderer/drawers/r_draw_rgba.h" #include "screen_triangle.h" -#include "poly_drawer32.h" -#include "poly_drawer8.h" #ifndef NO_SSE #include "poly_drawer32_sse2.h" +#else +#include "poly_drawer32.h" #endif +#include "poly_drawer8.h" #include "x86.h" -namespace -{ - class SSE2CPU { public: static const int HasSSE2 = 1; }; - class GenericCPU { public: static const int HasSSE2 = 0; }; -} - -template class TriangleBlock { public: @@ -123,17 +117,9 @@ private: void ClipTest(); void StencilWrite(); void SubsectorWrite(); - -#ifndef NO_SSE - void CoverageTestSSE2(); - void StencilEqualTestSSE2(); - void SubsectorTestSSE2(); - void SubsectorWriteSSE2(); -#endif }; -template -TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) +TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) { const TriVertex &v1 = *args->v1; const TriVertex &v2 = *args->v2; @@ -162,32 +148,19 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) const int X2 = (int)round(16.0f * v2.x); const int X3 = (int)round(16.0f * v3.x); #else - int Y1, Y2, Y3, X1, X2, X3; - if (CPUType::HasSSE2 == 1) - { - int tempround[4 * 3]; - __m128 m16 = _mm_set1_ps(16.0f); - __m128 mhalf = _mm_set1_ps(65536.5f); - __m128i m65536 = _mm_set1_epi32(65536); - _mm_storeu_si128((__m128i*)tempround, _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v1), m16), mhalf)), m65536)); - _mm_storeu_si128((__m128i*)(tempround + 4), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v2), m16), mhalf)), m65536)); - _mm_storeu_si128((__m128i*)(tempround + 8), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v3), m16), mhalf)), m65536)); - X1 = tempround[0]; - X2 = tempround[4]; - X3 = tempround[8]; - Y1 = tempround[1]; - Y2 = tempround[5]; - Y3 = tempround[9]; - } - else - { - Y1 = (int)round(16.0f * v1.y); - Y2 = (int)round(16.0f * v2.y); - Y3 = (int)round(16.0f * v3.y); - X1 = (int)round(16.0f * v1.x); - X2 = (int)round(16.0f * v2.x); - X3 = (int)round(16.0f * v3.x); - } + int tempround[4 * 3]; + __m128 m16 = _mm_set1_ps(16.0f); + __m128 mhalf = _mm_set1_ps(65536.5f); + __m128i m65536 = _mm_set1_epi32(65536); + _mm_storeu_si128((__m128i*)tempround, _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v1), m16), mhalf)), m65536)); + _mm_storeu_si128((__m128i*)(tempround + 4), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v2), m16), mhalf)), m65536)); + _mm_storeu_si128((__m128i*)(tempround + 8), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v3), m16), mhalf)), m65536)); + const int X1 = tempround[0]; + const int X2 = tempround[4]; + const int X3 = tempround[8]; + const int Y1 = tempround[1]; + const int Y2 = tempround[5]; + const int Y3 = tempround[9]; #endif // Deltas @@ -233,32 +206,28 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) if (DY31 < 0 || (DY31 == 0 && DX31 > 0)) C3++; #ifndef NO_SSE - if (CPUType::HasSSE2 == 1) - { - mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3); - mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3); - mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3); - mFDY12x4 = _mm_set1_epi32(FDY12 * 4); - mFDY23x4 = _mm_set1_epi32(FDY23 * 4); - mFDY31x4 = _mm_set1_epi32(FDY31 * 4); - mFDX12 = _mm_set1_epi32(FDX12); - mFDX23 = _mm_set1_epi32(FDX23); - mFDX31 = _mm_set1_epi32(FDX31); - mC1 = _mm_set1_epi32(C1); - mC2 = _mm_set1_epi32(C2); - mC3 = _mm_set1_epi32(C3); - mDX12 = _mm_set1_epi32(DX12); - mDY12 = _mm_set1_epi32(DY12); - mDX23 = _mm_set1_epi32(DX23); - mDY23 = _mm_set1_epi32(DY23); - mDX31 = _mm_set1_epi32(DX31); - mDY31 = _mm_set1_epi32(DY31); - } + mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3); + mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3); + mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3); + mFDY12x4 = _mm_set1_epi32(FDY12 * 4); + mFDY23x4 = _mm_set1_epi32(FDY23 * 4); + mFDY31x4 = _mm_set1_epi32(FDY31 * 4); + mFDX12 = _mm_set1_epi32(FDX12); + mFDX23 = _mm_set1_epi32(FDX23); + mFDX31 = _mm_set1_epi32(FDX31); + mC1 = _mm_set1_epi32(C1); + mC2 = _mm_set1_epi32(C2); + mC3 = _mm_set1_epi32(C3); + mDX12 = _mm_set1_epi32(DX12); + mDY12 = _mm_set1_epi32(DY12); + mDX23 = _mm_set1_epi32(DX23); + mDY23 = _mm_set1_epi32(DY23); + mDX31 = _mm_set1_epi32(DX31); + mDY31 = _mm_set1_epi32(DY31); #endif } -template -void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thread) +void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thread) { // First block line for this thread int core = thread->core; @@ -270,18 +239,9 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadD bool writeColor = args->uniforms->WriteColor(); bool writeStencil = args->uniforms->WriteStencil(); bool writeSubsector = args->uniforms->WriteSubsector(); - int bmode = (int)args->uniforms->BlendMode(); - // Find the drawer function for the given blend mode -#ifndef NO_SSE - void(*drawFunc)(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *); - if (CPUType::HasSSE2 == 1) - drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32_SSE2[bmode] : ScreenTriangle::TriDrawers8[bmode]; - else - drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32[bmode] : ScreenTriangle::TriDrawers8[bmode]; -#else + int bmode = (int)args->uniforms->BlendMode(); auto drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32[bmode] : ScreenTriangle::TriDrawers8[bmode]; -#endif // Loop through blocks for (int y = start_miny; y < maxy; y += q * num_cores) @@ -291,11 +251,7 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadD X = x; Y = y; - if (CPUType::HasSSE2 == 1) - CoverageTestSSE2(); - else - CoverageTest(); - + CoverageTest(); if (Mask0 == 0 && Mask1 == 0) continue; @@ -306,11 +262,7 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadD // To do: make the stencil test use its own flag for comparison mode instead of abusing the subsector test.. if (!subsectorTest) { - if (CPUType::HasSSE2 == 1) - StencilEqualTestSSE2(); - else - StencilEqualTest(); - + StencilEqualTest(); if (Mask0 == 0 && Mask1 == 0) continue; } @@ -320,11 +272,7 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadD if (Mask0 == 0 && Mask1 == 0) continue; - if (CPUType::HasSSE2 == 1) - SubsectorTestSSE2(); - else - SubsectorTest(); - + SubsectorTest(); if (Mask0 == 0 && Mask1 == 0) continue; } @@ -334,18 +282,14 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadD if (writeStencil) StencilWrite(); if (writeSubsector) - { - if (CPUType::HasSSE2 == 1) - SubsectorWriteSSE2(); - else - SubsectorWrite(); - } + SubsectorWrite(); } } } -template -void TriangleBlock::SubsectorTest() +#ifdef NO_SSE + +void TriangleBlock::SubsectorTest() { int block = (X >> 3) + (Y >> 3) * subsectorPitch; uint32_t *subsector = subsectorGBuffer + block * 64; @@ -371,10 +315,9 @@ void TriangleBlock::SubsectorTest() Mask1 = Mask1 & mask1; } -#ifndef NO_SSE +#else -template -void TriangleBlock::SubsectorTestSSE2() +void TriangleBlock::SubsectorTest() { int block = (X >> 3) + (Y >> 3) * subsectorPitch; uint32_t *subsector = subsectorGBuffer + block * 64; @@ -402,8 +345,7 @@ void TriangleBlock::SubsectorTestSSE2() #endif -template -void TriangleBlock::ClipTest() +void TriangleBlock::ClipTest() { static const uint32_t clipxmask[8] = { @@ -437,8 +379,9 @@ void TriangleBlock::ClipTest() Mask1 = Mask1 & xmask & ymask1; } -template -void TriangleBlock::StencilEqualTest() +#ifdef NO_SSE + +void TriangleBlock::StencilEqualTest() { // Stencil test the whole block, if possible int block = (X >> 3) + (Y >> 3) * stencilPitch; @@ -481,10 +424,9 @@ void TriangleBlock::StencilEqualTest() } } -#ifndef NO_SSE +#else -template -void TriangleBlock::StencilEqualTestSSE2() +void TriangleBlock::StencilEqualTest() { // Stencil test the whole block, if possible int block = (X >> 3) + (Y >> 3) * stencilPitch; @@ -550,8 +492,7 @@ void TriangleBlock::StencilEqualTestSSE2() #endif -template -void TriangleBlock::StencilGreaterEqualTest() +void TriangleBlock::StencilGreaterEqualTest() { // Stencil test the whole block, if possible int block = (X >> 3) + (Y >> 3) * stencilPitch; @@ -594,8 +535,9 @@ void TriangleBlock::StencilGreaterEqualTest() } } -template -void TriangleBlock::CoverageTest() +#ifdef NO_SSE + +void TriangleBlock::CoverageTest() { // Corners of block int x0 = X << 4; @@ -692,10 +634,9 @@ void TriangleBlock::CoverageTest() } } -#ifndef NO_SSE +#else -template -void TriangleBlock::CoverageTestSSE2() +void TriangleBlock::CoverageTest() { // Corners of block int x0 = X << 4; @@ -805,8 +746,7 @@ void TriangleBlock::CoverageTestSSE2() #endif -template -void TriangleBlock::StencilWrite() +void TriangleBlock::StencilWrite() { int block = (X >> 3) + (Y >> 3) * stencilPitch; uint8_t *stencilBlock = &stencilValues[block * 64]; @@ -856,8 +796,9 @@ void TriangleBlock::StencilWrite() } } -template -void TriangleBlock::SubsectorWrite() +#ifdef NO_SSE + +void TriangleBlock::SubsectorWrite() { int block = (X >> 3) + (Y >> 3) * subsectorPitch; uint32_t *subsector = subsectorGBuffer + block * 64; @@ -890,10 +831,9 @@ void TriangleBlock::SubsectorWrite() } } -#ifndef NO_SSE +#else -template -void TriangleBlock::SubsectorWriteSSE2() +void TriangleBlock::SubsectorWrite() { int block = (X >> 3) + (Y >> 3) * subsectorPitch; uint32_t *subsector = subsectorGBuffer + block * 64; @@ -950,21 +890,8 @@ void TriangleBlock::SubsectorWriteSSE2() void ScreenTriangle::Draw(const TriDrawTriangleArgs *args, WorkerThreadData *thread) { -#ifdef NO_SSE - TriangleBlock block(args); + TriangleBlock block(args); block.Loop(args, thread); -#else - if (CPU.bSSE2) - { - TriangleBlock block(args); - block.Loop(args, thread); - } - else - { - TriangleBlock block(args); - block.Loop(args, thread); - } -#endif } void(*ScreenTriangle::TriDrawers8[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = @@ -1021,37 +948,6 @@ void(*ScreenTriangle::TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDra &TriScreenDrawer32::Execute // Fuzz }; -#ifndef NO_SSE - -void(*ScreenTriangle::TriDrawers32_SSE2[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = -{ - &TriScreenDrawer32_SSE2::Execute, // TextureOpaque - &TriScreenDrawer32_SSE2::Execute, // TextureMasked - &TriScreenDrawer32_SSE2::Execute, // TextureAdd - &TriScreenDrawer32_SSE2::Execute, // TextureSub - &TriScreenDrawer32_SSE2::Execute, // TextureRevSub - &TriScreenDrawer32_SSE2::Execute, // TextureAddSrcColor - &TriScreenDrawer32_SSE2::Execute, // TranslatedOpaque - &TriScreenDrawer32_SSE2::Execute, // TranslatedMasked - &TriScreenDrawer32_SSE2::Execute, // TranslatedAdd - &TriScreenDrawer32_SSE2::Execute, // TranslatedSub - &TriScreenDrawer32_SSE2::Execute, // TranslatedRevSub - &TriScreenDrawer32_SSE2::Execute, // TranslatedAddSrcColor - &TriScreenDrawer32_SSE2::Execute, // Shaded - &TriScreenDrawer32_SSE2::Execute, // AddShaded - &TriScreenDrawer32_SSE2::Execute, // Stencil - &TriScreenDrawer32_SSE2::Execute, // AddStencil - &TriScreenDrawer32_SSE2::Execute, // FillOpaque - &TriScreenDrawer32_SSE2::Execute, // FillAdd - &TriScreenDrawer32_SSE2::Execute, // FillSub - &TriScreenDrawer32_SSE2::Execute, // FillRevSub - &TriScreenDrawer32_SSE2::Execute, // FillAddSrcColor - &TriScreenDrawer32_SSE2::Execute, // Skycap - &TriScreenDrawer32_SSE2::Execute // Fuzz -}; - -#endif - void(*ScreenTriangle::RectDrawers8[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) = { &RectScreenDrawer8::Execute, // TextureOpaque @@ -1106,35 +1002,4 @@ void(*ScreenTriangle::RectDrawers32[])(const void *, int, int, int, const RectDr &RectScreenDrawer32::Execute // Fuzz }; -#ifndef NO_SSE - -void(*ScreenTriangle::RectDrawers32_SSE2[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) = -{ - &RectScreenDrawer32_SSE2::Execute, // TextureOpaque - &RectScreenDrawer32_SSE2::Execute, // TextureMasked - &RectScreenDrawer32_SSE2::Execute, // TextureAdd - &RectScreenDrawer32_SSE2::Execute, // TextureSub - &RectScreenDrawer32_SSE2::Execute, // TextureRevSub - &RectScreenDrawer32_SSE2::Execute, // TextureAddSrcColor - &RectScreenDrawer32_SSE2::Execute, // TranslatedOpaque - &RectScreenDrawer32_SSE2::Execute, // TranslatedMasked - &RectScreenDrawer32_SSE2::Execute, // TranslatedAdd - &RectScreenDrawer32_SSE2::Execute, // TranslatedSub - &RectScreenDrawer32_SSE2::Execute, // TranslatedRevSub - &RectScreenDrawer32_SSE2::Execute, // TranslatedAddSrcColor - &RectScreenDrawer32_SSE2::Execute, // Shaded - &RectScreenDrawer32_SSE2::Execute, // AddShaded - &RectScreenDrawer32_SSE2::Execute, // Stencil - &RectScreenDrawer32_SSE2::Execute, // AddStencil - &RectScreenDrawer32_SSE2::Execute, // FillOpaque - &RectScreenDrawer32_SSE2::Execute, // FillAdd - &RectScreenDrawer32_SSE2::Execute, // FillSub - &RectScreenDrawer32_SSE2::Execute, // FillRevSub - &RectScreenDrawer32_SSE2::Execute, // FillAddSrcColor - &RectScreenDrawer32_SSE2::Execute, // Skycap - &RectScreenDrawer32_SSE2::Execute // Fuzz -}; - -#endif - int ScreenTriangle::FuzzStart = 0; diff --git a/src/polyrenderer/drawers/screen_triangle.h b/src/polyrenderer/drawers/screen_triangle.h index 615a0c631..3dd4c24eb 100644 --- a/src/polyrenderer/drawers/screen_triangle.h +++ b/src/polyrenderer/drawers/screen_triangle.h @@ -131,11 +131,6 @@ public: static void(*RectDrawers8[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *); static void(*RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *); -#ifndef NO_SSE - static void(*TriDrawers32_SSE2[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *); - static void(*RectDrawers32_SSE2[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *); -#endif - static int FuzzStart; }; diff --git a/src/polyrenderer/math/tri_matrix.cpp b/src/polyrenderer/math/tri_matrix.cpp index 5c88a96ec..f6bb03a5e 100644 --- a/src/polyrenderer/math/tri_matrix.cpp +++ b/src/polyrenderer/math/tri_matrix.cpp @@ -185,42 +185,21 @@ ShadedTriVertex TriMatrix::operator*(TriVertex v) const sv.y = vy; sv.z = vz; sv.w = vw; +#else + __m128 m0 = _mm_loadu_ps(matrix); + __m128 m1 = _mm_loadu_ps(matrix + 4); + __m128 m2 = _mm_loadu_ps(matrix + 8); + __m128 m3 = _mm_loadu_ps(matrix + 12); + __m128 mv = _mm_loadu_ps(&v.x); + m0 = _mm_mul_ps(m0, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(0, 0, 0, 0))); + m1 = _mm_mul_ps(m1, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(1, 1, 1, 1))); + m2 = _mm_mul_ps(m2, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(2, 2, 2, 2))); + m3 = _mm_mul_ps(m3, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(3, 3, 3, 3))); + mv = _mm_add_ps(_mm_add_ps(_mm_add_ps(m0, m1), m2), m3); + ShadedTriVertex sv; + _mm_storeu_ps(&sv.x, mv); +#endif sv.u = v.u; sv.v = v.v; return sv; -#else - if (CPU.bSSE2) - { - __m128 m0 = _mm_loadu_ps(matrix); - __m128 m1 = _mm_loadu_ps(matrix + 4); - __m128 m2 = _mm_loadu_ps(matrix + 8); - __m128 m3 = _mm_loadu_ps(matrix + 12); - __m128 mv = _mm_loadu_ps(&v.x); - m0 = _mm_mul_ps(m0, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(0, 0, 0, 0))); - m1 = _mm_mul_ps(m1, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(1, 1, 1, 1))); - m2 = _mm_mul_ps(m2, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(2, 2, 2, 2))); - m3 = _mm_mul_ps(m3, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(3, 3, 3, 3))); - mv = _mm_add_ps(_mm_add_ps(_mm_add_ps(m0, m1), m2), m3); - ShadedTriVertex sv; - _mm_storeu_ps(&sv.x, mv); - sv.u = v.u; - sv.v = v.v; - return sv; - } - else - { - float vx = matrix[0 * 4 + 0] * v.x + matrix[1 * 4 + 0] * v.y + matrix[2 * 4 + 0] * v.z + matrix[3 * 4 + 0] * v.w; - float vy = matrix[0 * 4 + 1] * v.x + matrix[1 * 4 + 1] * v.y + matrix[2 * 4 + 1] * v.z + matrix[3 * 4 + 1] * v.w; - float vz = matrix[0 * 4 + 2] * v.x + matrix[1 * 4 + 2] * v.y + matrix[2 * 4 + 2] * v.z + matrix[3 * 4 + 2] * v.w; - float vw = matrix[0 * 4 + 3] * v.x + matrix[1 * 4 + 3] * v.y + matrix[2 * 4 + 3] * v.z + matrix[3 * 4 + 3] * v.w; - ShadedTriVertex sv; - sv.x = vx; - sv.y = vy; - sv.z = vz; - sv.w = vw; - sv.u = v.u; - sv.v = v.v; - return sv; - } -#endif } diff --git a/src/posix/cocoa/i_timer.cpp b/src/posix/cocoa/i_timer.cpp index 85a0ae8ce..ac5c49d3e 100644 --- a/src/posix/cocoa/i_timer.cpp +++ b/src/posix/cocoa/i_timer.cpp @@ -35,7 +35,6 @@ #include #include #include -#include #include "doomdef.h" #include "i_system.h" @@ -124,12 +123,7 @@ void* TimerThreadFunc(void*) if (!s_isTicFrozen) { - // The following GCC/Clang intrinsic can be used instead of OS X specific function: - // __sync_add_and_fetch(&s_tics, 1); - // Although it's not supported on all platform/compiler combination, - // e.g. GCC 4.0.1 with PowerPC target architecture - - OSAtomicIncrement32(&s_tics); + __sync_add_and_fetch(&s_tics, 1); } s_timerStart = I_MSTime(); diff --git a/src/posix/cocoa/i_video.mm b/src/posix/cocoa/i_video.mm index afeddf10d..9b22b5495 100644 --- a/src/posix/cocoa/i_video.mm +++ b/src/posix/cocoa/i_video.mm @@ -116,11 +116,22 @@ DFrameBuffer *CreateGLSWFrameBuffer(int width, int height, bool bgra, bool fullscreen); +CUSTOM_CVAR(Bool, vid_glswfb, true, CVAR_NOINITCALL | CVAR_GLOBALCONFIG | CVAR_NOINITCALL) +{ + Printf("This won't take effect until " GAMENAME " is restarted.\n"); +} + EXTERN_CVAR(Bool, ticker ) EXTERN_CVAR(Bool, vid_vsync) EXTERN_CVAR(Bool, vid_hidpi) -CUSTOM_CVAR(Bool, swtruecolor, true, CVAR_ARCHIVE | CVAR_GLOBALCONFIG | CVAR_NOINITCALL) +#if defined __ppc__ || defined __ppc64__ +static const bool TRUECOLOR_DEFAULT = false; +#else // other than PowerPC +static const bool TRUECOLOR_DEFAULT = true; +#endif // PowerPC + +CUSTOM_CVAR(Bool, swtruecolor, TRUECOLOR_DEFAULT, CVAR_ARCHIVE | CVAR_GLOBALCONFIG | CVAR_NOINITCALL) { // Strictly speaking this doesn't require a mode switch, but it is the easiest // way to force a CreateFramebuffer call without a lot of refactoring. @@ -551,9 +562,12 @@ CocoaVideo::CocoaVideo() // Create OpenGL pixel format - NSOpenGLPixelFormat* pixelFormat = CreatePixelFormat(OpenGLProfile::Core); + const OpenGLProfile defaultProfile = (1 == vid_renderer || vid_glswfb) + ? OpenGLProfile::Core + : OpenGLProfile::Legacy; + NSOpenGLPixelFormat* pixelFormat = CreatePixelFormat(defaultProfile); - if (nil == pixelFormat) + if (nil == pixelFormat && OpenGLProfile::Core == defaultProfile) { pixelFormat = CreatePixelFormat(OpenGLProfile::Legacy); @@ -647,10 +661,20 @@ DFrameBuffer* CocoaVideo::CreateFrameBuffer(const int width, const int height, c { fb = new OpenGLFrameBuffer(NULL, width, height, 32, 60, fullscreen); } + else if (vid_glswfb) + { + fb = CreateGLSWFrameBuffer(width, height, bgra, fullscreen); + + if (!fb->IsValid()) + { + delete fb; + + fb = new CocoaFrameBuffer(width, height, bgra, fullscreen); + } + } else { - //fb = new CocoaFrameBuffer(width, height, bgra, fullscreen); - fb = CreateGLSWFrameBuffer(width, height, bgra, fullscreen); + fb = new CocoaFrameBuffer(width, height, bgra, fullscreen); } fb->SetFlash(flashColor, flashAmount); diff --git a/src/s_sound.cpp b/src/s_sound.cpp index 48139da4e..ccf5f554a 100644 --- a/src/s_sound.cpp +++ b/src/s_sound.cpp @@ -2596,21 +2596,8 @@ bool S_ChangeMusic (const char *musicname, int order, bool looping, bool force) { if ((lumpnum = Wads.CheckNumForFullName (musicname, true, ns_music)) == -1) { - if (strstr(musicname, "://") > musicname) - { - // Looks like a URL; try it as such. - handle = I_RegisterURLSong(musicname); - if (handle == NULL) - { - Printf ("Could not open \"%s\"\n", musicname); - return false; - } - } - else - { - Printf ("Music \"%s\" not found\n", musicname); - return false; - } + Printf ("Music \"%s\" not found\n", musicname); + return false; } if (handle == NULL) { diff --git a/src/scripting/types.cpp b/src/scripting/types.cpp index 82263dfcd..f07356d42 100644 --- a/src/scripting/types.cpp +++ b/src/scripting/types.cpp @@ -705,14 +705,13 @@ PFloat::PFloat(unsigned int size) Flags |= TYPE_Float; if (size == 8) { -#ifdef __i386__ - // According to System V i386 ABI alignment of double type is 4 - // GCC and Clang for 32-bit Intel targets follow this requirement - // However GCC has -malign-double option to enable 8-byte alignment - // So calculation of the actual alignment is needed - struct AlignmentCheck { uint8_t i; double d; }; - Align = static_cast(offsetof(AlignmentCheck, d)); -#endif // __i386__ + if (sizeof(void*) == 4) + { + // Some ABIs for 32-bit platforms define alignment of double type as 4 bytes + // Intel POSIX (System V ABI) and PowerPC Macs are examples of those + struct AlignmentCheck { uint8_t i; double d; }; + Align = static_cast(offsetof(AlignmentCheck, d)); + } SetDoubleSymbols(); } diff --git a/src/sound/i_music.cpp b/src/sound/i_music.cpp index a31c4de74..8ee359e8f 100644 --- a/src/sound/i_music.cpp +++ b/src/sound/i_music.cpp @@ -558,25 +558,6 @@ MusInfo *I_RegisterCDSong (int track, int id) return info; } -//========================================================================== -// -// -// -//========================================================================== - -MusInfo *I_RegisterURLSong (const char *url) -{ - StreamSong *song; - - song = new StreamSong(url); - if (song->IsValid()) - { - return song; - } - delete song; - return NULL; -} - //========================================================================== // // ungzip diff --git a/src/sound/i_music.h b/src/sound/i_music.h index 9939868a7..a803409dc 100644 --- a/src/sound/i_music.h +++ b/src/sound/i_music.h @@ -57,7 +57,6 @@ class MusInfo; struct MidiDeviceSetting; MusInfo *I_RegisterSong (FileReader *reader, MidiDeviceSetting *device); MusInfo *I_RegisterCDSong (int track, int cdid = 0); -MusInfo *I_RegisterURLSong (const char *url); // The base music class. Everything is derived from this -------------------- diff --git a/src/sound/i_musicinterns.h b/src/sound/i_musicinterns.h index 596093a06..531d69ec7 100644 --- a/src/sound/i_musicinterns.h +++ b/src/sound/i_musicinterns.h @@ -594,7 +594,6 @@ class StreamSong : public MusInfo { public: StreamSong (FileReader *reader); - StreamSong (const char *url); ~StreamSong (); void Play (bool looping, int subsong); void Pause (); diff --git a/src/sound/i_sound.cpp b/src/sound/i_sound.cpp index b51cc298b..e0b0dd280 100644 --- a/src/sound/i_sound.cpp +++ b/src/sound/i_sound.cpp @@ -576,11 +576,6 @@ std::pair SoundRenderer::LoadSoundVoc(uint8_t *sfxdata, int le return retval; } -SoundStream *SoundRenderer::OpenStream(const char *url, int flags) -{ - return 0; -} - SoundDecoder *SoundRenderer::CreateDecoder(FileReader *reader) { SoundDecoder *decoder = NULL; diff --git a/src/sound/i_sound.h b/src/sound/i_sound.h index 97d81d9dc..ed6473bd8 100644 --- a/src/sound/i_sound.h +++ b/src/sound/i_sound.h @@ -108,7 +108,6 @@ public: // Streaming sounds. virtual SoundStream *CreateStream (SoundStreamCallback callback, int buffbytes, int flags, int samplerate, void *userdata) = 0; virtual SoundStream *OpenStream (FileReader *reader, int flags) = 0; - virtual SoundStream *OpenStream (const char *url, int flags); // Starts a sound. virtual FISoundChannel *StartSound (SoundHandle sfx, float vol, int pitch, int chanflags, FISoundChannel *reuse_chan) = 0; diff --git a/src/sound/mididevices/music_opl_mididevice.cpp b/src/sound/mididevices/music_opl_mididevice.cpp index d584d3ebd..72ae555a8 100644 --- a/src/sound/mididevices/music_opl_mididevice.cpp +++ b/src/sound/mididevices/music_opl_mididevice.cpp @@ -296,7 +296,7 @@ FString OPLMIDIDevice::GetStats() char star[3] = { TEXTCOLOR_ESCAPE, 'A', '*' }; for (uint32_t i = 0; i < io->NumChannels; ++i) { - if (voices[i].index == -1) + if (voices[i].index == ~0u) { star[1] = CR_BRICK + 'A'; } diff --git a/src/sound/musicformats/music_stream.cpp b/src/sound/musicformats/music_stream.cpp index 2e57bd31b..dbb5a85f2 100644 --- a/src/sound/musicformats/music_stream.cpp +++ b/src/sound/musicformats/music_stream.cpp @@ -90,11 +90,6 @@ StreamSong::StreamSong (FileReader *reader) m_Stream = GSnd->OpenStream (reader, SoundStream::Loop); } -StreamSong::StreamSong (const char *url) -{ - m_Stream = GSnd->OpenStream (url, SoundStream::Loop); -} - bool StreamSong::IsPlaying () { if (m_Status != STATE_Stopped) diff --git a/src/sound/oalsound.cpp b/src/sound/oalsound.cpp index 0f20ab36b..946fba2b5 100644 --- a/src/sound/oalsound.cpp +++ b/src/sound/oalsound.cpp @@ -230,7 +230,7 @@ class OpenALSoundStream : public SoundStream if(Renderer->FreeSfx.Size() == 0) { FSoundChan *lowest = Renderer->FindLowestChannel(); - if(lowest) Renderer->StopChannel(lowest); + if(lowest) Renderer->ForceStopChannel(lowest); if(Renderer->FreeSfx.Size() == 0) return false; @@ -810,6 +810,14 @@ OpenALSoundRenderer::OpenALSoundRenderer() return; } + ALCint refresh=0; + alcGetIntegerv(Device, ALC_REFRESH, 1, &refresh); + if(refresh > 0) + { + // Round up instead of down + UpdateTimeMS = (1000+refresh-1) / refresh; + } + ALCint numMono=0, numStereo=0; alcGetIntegerv(Device, ALC_MONO_SOURCES, 1, &numMono); alcGetIntegerv(Device, ALC_STEREO_SOURCES, 1, &numStereo); @@ -1205,7 +1213,7 @@ std::pair OpenALSoundRenderer::LoadSound(uint8_t *sfxdata, int ChannelConfig chans; SampleType type; int srate; - uint32_t loop_start = 0, loop_end = 0; + uint32_t loop_start = 0, loop_end = ~0u; bool startass = false, endass = false; if (!memcmp(sfxdata, "OggS", 4) || !memcmp(sfxdata, "FLAC", 4)) @@ -1282,8 +1290,9 @@ std::pair OpenALSoundRenderer::LoadSound(uint8_t *sfxdata, int if (!startass) loop_start = Scale(loop_start, srate, 1000); if (!endass) loop_end = Scale(loop_end, srate, 1000); - if (loop_start < 0) loop_start = 0; - if (loop_end >= data.Size() / samplesize) loop_end = data.Size() / samplesize - 1; + const uint32_t samples = data.Size() / samplesize; + if (loop_start > samples) loop_start = 0; + if (loop_end > samples) loop_end = samples; if ((loop_start > 0 || loop_end > 0) && loop_end > loop_start && AL.SOFT_loop_points) { @@ -1314,7 +1323,7 @@ void OpenALSoundRenderer::UnloadSound(SoundHandle sfx) if((ALuint)bufID == buffer) { FSoundChan *next = schan->NextChan; - StopChannel(schan); + ForceStopChannel(schan); schan = next; continue; } @@ -1322,6 +1331,20 @@ void OpenALSoundRenderer::UnloadSound(SoundHandle sfx) schan = schan->NextChan; } + // Make sure to kill any currently fading sounds too + for(auto iter = FadingSources.begin();iter != FadingSources.end();) + { + ALint bufID = 0; + alGetSourcei(iter->first, AL_BUFFER, &bufID); + if(static_cast(bufID) == buffer) + { + FreeSource(iter->first); + iter = FadingSources.erase(iter); + } + else + ++iter; + } + alDeleteBuffers(1, &buffer); getALError(); } @@ -1358,7 +1381,7 @@ FISoundChannel *OpenALSoundRenderer::StartSound(SoundHandle sfx, float vol, int if(FreeSfx.Size() == 0) { FSoundChan *lowest = FindLowestChannel(); - if(lowest) StopChannel(lowest); + if(lowest) ForceStopChannel(lowest); if(FreeSfx.Size() == 0) return NULL; @@ -1461,7 +1484,7 @@ FISoundChannel *OpenALSoundRenderer::StartSound3D(SoundHandle sfx, SoundListener { if(lowest->Priority < priority || (lowest->Priority == priority && lowest->DistanceSqr > dist_sqr)) - StopChannel(lowest); + ForceStopChannel(lowest); } if(FreeSfx.Size() == 0) return NULL; @@ -1665,16 +1688,8 @@ void OpenALSoundRenderer::ChannelVolume(FISoundChannel *chan, float volume) alSourcef(source, AL_GAIN, SfxVolume * volume); } -void OpenALSoundRenderer::StopChannel(FISoundChannel *chan) +void OpenALSoundRenderer::FreeSource(ALuint source) { - if(chan == NULL || chan->SysChannel == NULL) - return; - - ALuint source = GET_PTRID(chan->SysChannel); - // Release first, so it can be properly marked as evicted if it's being - // forcefully killed - S_ChannelEnded(chan); - alSourceRewind(source); alSourcei(source, AL_BUFFER, 0); getALError(); @@ -1684,11 +1699,48 @@ void OpenALSoundRenderer::StopChannel(FISoundChannel *chan) PausableSfx.Delete(i); if((i=ReverbSfx.Find(source)) < ReverbSfx.Size()) ReverbSfx.Delete(i); + if((i=SfxGroup.Find(source)) < SfxGroup.Size()) + SfxGroup.Delete(i); - SfxGroup.Delete(SfxGroup.Find(source)); FreeSfx.Push(source); } +void OpenALSoundRenderer::StopChannel(FISoundChannel *chan) +{ + if(chan == NULL || chan->SysChannel == NULL) + return; + + ALuint source = GET_PTRID(chan->SysChannel); + // Release first, so it can be properly marked as evicted if it's being killed + S_ChannelEnded(chan); + + ALint state = AL_INITIAL; + alGetSourcei(source, AL_SOURCE_STATE, &state); + if(state != AL_PLAYING) + FreeSource(source); + else + { + // The sound is being killed while playing, so set its gain to 0 and track it + // as it fades. + alSourcef(source, AL_GAIN, 0.f); + getALError(); + + FadingSources.insert(std::make_pair( + source, std::chrono::steady_clock::now().time_since_epoch().count() + )); + } +} + +void OpenALSoundRenderer::ForceStopChannel(FISoundChannel *chan) +{ + ALuint source = GET_PTRID(chan->SysChannel); + if(!source) return; + + S_ChannelEnded(chan); + FreeSource(source); +} + + unsigned int OpenALSoundRenderer::GetPosition(FISoundChannel *chan) { if(chan == NULL || chan->SysChannel == NULL) @@ -1945,6 +1997,23 @@ void OpenALSoundRenderer::UpdateSounds() { alProcessUpdatesSOFT(); + if(!FadingSources.empty()) + { + auto cur_time = std::chrono::steady_clock::now().time_since_epoch(); + for(auto iter = FadingSources.begin();iter != FadingSources.end();) + { + auto time_diff = std::chrono::duration_cast(cur_time - + std::chrono::steady_clock::time_point::duration(iter->second)); + if(time_diff.count() >= UpdateTimeMS) + { + FreeSource(iter->first); + iter = FadingSources.erase(iter); + } + else + ++iter; + } + } + if(ALC.EXT_disconnect) { ALCint connected = ALC_TRUE; @@ -2030,17 +2099,18 @@ void OpenALSoundRenderer::PrintStatus() FString OpenALSoundRenderer::GatherStats() { - ALCint updates = 1; - alcGetIntegerv(Device, ALC_REFRESH, 1, &updates); + FString out; + + ALCint refresh = 1; + alcGetIntegerv(Device, ALC_REFRESH, 1, &refresh); getALCError(Device); uint32_t total = Sources.Size(); uint32_t used = SfxGroup.Size()+Streams.Size(); uint32_t unused = FreeSfx.Size(); - FString out; - out.Format("%u sources (" TEXTCOLOR_YELLOW"%u" TEXTCOLOR_NORMAL" active, " TEXTCOLOR_YELLOW"%u" TEXTCOLOR_NORMAL" free), Update interval: " TEXTCOLOR_YELLOW"%d" TEXTCOLOR_NORMAL"ms", - total, used, unused, 1000/updates); + out.Format("%u sources (" TEXTCOLOR_YELLOW"%u" TEXTCOLOR_NORMAL" active, " TEXTCOLOR_YELLOW"%u" TEXTCOLOR_NORMAL" free), Update interval: " TEXTCOLOR_YELLOW"%.1f" TEXTCOLOR_NORMAL"ms", + total, used, unused, 1000.f/static_cast(refresh)); return out; } @@ -2105,7 +2175,7 @@ void OpenALSoundRenderer::PurgeStoppedSources() { if(schan->SysChannel != NULL && src == GET_PTRID(schan->SysChannel)) { - StopChannel(schan); + ForceStopChannel(schan); break; } schan = schan->NextChan; diff --git a/src/sound/oalsound.h b/src/sound/oalsound.h index f14d7dd61..82cb4a736 100644 --- a/src/sound/oalsound.h +++ b/src/sound/oalsound.h @@ -5,6 +5,7 @@ #include #include #include +#include #include "i_sound.h" #include "s_sound.h" @@ -200,8 +201,10 @@ private: void RemoveStream(OpenALSoundStream *stream); void LoadReverb(const ReverbContainer *env); + void FreeSource(ALuint source); void PurgeStoppedSources(); static FSoundChan *FindLowestChannel(); + void ForceStopChannel(FISoundChannel *chan); std::thread StreamThread; std::mutex StreamLock; @@ -222,6 +225,10 @@ private: TArray ReverbSfx; TArray SfxGroup; + int UpdateTimeMS; + using SourceTimeMap = std::unordered_map; + SourceTimeMap FadingSources; + const ReverbContainer *PrevEnvironment; typedef TMap EffectMap; diff --git a/src/sound/oplsynth/musicblock.cpp b/src/sound/oplsynth/musicblock.cpp index d905744d4..c2e23b0dc 100644 --- a/src/sound/oplsynth/musicblock.cpp +++ b/src/sound/oplsynth/musicblock.cpp @@ -140,7 +140,7 @@ void musicBlock::voiceKeyOn(uint32_t slot, uint32_t channo, GenMidiInstrument *i // Work out the note to use. This is normally the same as // the key, unless it is a fixed pitch instrument. - uint32_t note; + int note; if (instrument->flags & GENMIDI_FLAG_FIXED) note = instrument->fixed_note; else if (channo == CHAN_PERCUSSION) note = 60; else note = key; @@ -475,6 +475,6 @@ void musicBlock::stopAllVoices() { for (uint32_t i = 0; i < io->NumChannels; i++) { - if (voices[i].index >= 0) releaseVoice(i, 1); + if (voices[i].index != ~0u) releaseVoice(i, 1); } } diff --git a/src/stats.h b/src/stats.h index df61dafb0..86a5b8f1c 100644 --- a/src/stats.h +++ b/src/stats.h @@ -115,13 +115,7 @@ extern "C" unsigned __int64 __rdtsc(void); #pragma intrinsic(__rdtsc) inline unsigned __int64 rdtsc() { -#ifndef _M_X64 - if (CPU.bRDTSC) -#endif - { - return __rdtsc(); - } - return 0; + return __rdtsc(); } #else inline uint64_t rdtsc() diff --git a/src/v_draw.cpp b/src/v_draw.cpp index ea589cbfc..49d23250e 100644 --- a/src/v_draw.cpp +++ b/src/v_draw.cpp @@ -778,7 +778,7 @@ bool DCanvas::ParseDrawTextureTags(FTexture *img, double x, double y, uint32_t t if (parms->lclip < clipleft) parms->lclip = clipleft; if (parms->rclip > clipleft + clipwidth) parms->rclip = clipleft + clipwidth; if (parms->uclip < cliptop) parms->uclip = cliptop; - if (parms->dclip < cliptop + clipheight) parms->uclip = cliptop + clipheight; + if (parms->dclip > cliptop + clipheight) parms->dclip = cliptop + clipheight; } if (parms->uclip >= parms->dclip || parms->lclip >= parms->rclip) diff --git a/src/v_palette.cpp b/src/v_palette.cpp index 606826d13..11e3f6945 100644 --- a/src/v_palette.cpp +++ b/src/v_palette.cpp @@ -410,7 +410,6 @@ void InitPalette () R_InitColormaps (); } -void DoBlending_MMX (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); void DoBlending_SSE2 (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) @@ -435,37 +434,17 @@ void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, in return; } #if defined(_M_X64) || defined(_M_IX86) || defined(__i386__) || defined(__amd64__) - else if (CPU.bSSE2) + else if (count >= 4) { - if (count >= 4) + int not3count = count & ~3; + DoBlending_SSE2 (from, to, not3count, r, g, b, a); + count &= 3; + if (count <= 0) { - int not3count = count & ~3; - DoBlending_SSE2 (from, to, not3count, r, g, b, a); - count &= 3; - if (count <= 0) - { - return; - } - from += not3count; - to += not3count; - } - } -#endif -#if defined(_M_IX86) || defined(__i386__) - else if (CPU.bMMX) - { - if (count >= 4) - { - int not3count = count & ~3; - DoBlending_MMX (from, to, not3count, r, g, b, a); - count &= 3; - if (count <= 0) - { - return; - } - from += not3count; - to += not3count; + return; } + from += not3count; + to += not3count; } #endif int i, ia; diff --git a/src/version.h b/src/version.h index ef25c91e6..1314450ce 100644 --- a/src/version.h +++ b/src/version.h @@ -31,11 +31,13 @@ ** */ -#include "gitinfo.h" - #ifndef __VERSION_H__ #define __VERSION_H__ +#ifdef _WIN32 +#include "gitinfo.h" +#endif // _WIN32 + const char *GetGitDescription(); const char *GetGitHash(); const char *GetGitTime(); diff --git a/src/win32/i_system.cpp b/src/win32/i_system.cpp index 003e95662..609be2301 100644 --- a/src/win32/i_system.cpp +++ b/src/win32/i_system.cpp @@ -707,7 +707,7 @@ void CalculateCPUSpeed() QueryPerformanceFrequency (&freq); - if (freq.QuadPart != 0 && CPU.bRDTSC) + if (freq.QuadPart != 0) { LARGE_INTEGER count1, count2; cycle_t ClockCalibration; diff --git a/src/win32/win32gliface.cpp b/src/win32/win32gliface.cpp index 3d4bca30a..614adaa2d 100644 --- a/src/win32/win32gliface.cpp +++ b/src/win32/win32gliface.cpp @@ -167,7 +167,7 @@ public: Win32GLVideo::Win32GLVideo(int parm) : m_Modes(NULL), m_IsFullscreen(false) { #ifdef _WIN32 - if (CPU.bRDTSC) gl_CalculateCPUSpeed(); + gl_CalculateCPUSpeed(); #endif I_SetWndProc(); m_DisplayWidth = vid_defwidth; diff --git a/src/x86.cpp b/src/x86.cpp index 20df06aa1..b8c2d01d0 100644 --- a/src/x86.cpp +++ b/src/x86.cpp @@ -244,9 +244,6 @@ void DumpCPUInfo(const CPUInfo *cpu) cpu->Family, cpu->Model, cpu->Stepping); } Printf(" Features:"); - if (cpu->bMMX) Printf(" MMX"); - if (cpu->bMMXPlus) Printf(" MMX+"); - if (cpu->bSSE) Printf(" SSE"); if (cpu->bSSE2) Printf(" SSE2"); if (cpu->bSSE3) Printf(" SSE3"); if (cpu->bSSSE3) Printf(" SSSE3"); @@ -258,50 +255,6 @@ void DumpCPUInfo(const CPUInfo *cpu) } } -#if !defined(__amd64__) && !defined(_M_X64) - -void DoBlending_MMX(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) -{ - __m64 blendcolor; - __m64 blendalpha; - __m64 zero; - __m64 blending256; - __m64 color1; - __m64 color2; - - zero = _mm_setzero_si64(); -#ifndef __GNUC__ - blending256.m64_i64 = 0x10001000100; -#else - blending256 = (__m64)0x10001000100ll; -#endif - - blendcolor = _mm_unpacklo_pi8(_m_from_int((r << 16) | (g << 8) | b), zero); // 000000RR 00GG00BB - blendalpha = _mm_unpacklo_pi8(_m_from_int((a << 16) | (a << 8) | a), zero); // 000000AA 00AA00AA - - blendcolor = _mm_mullo_pi16(blendcolor, blendalpha); // premultiply blend by alpha - blendalpha = _mm_subs_pu16(blending256, blendalpha); // one minus alpha - - // Do two colors per iteration: Count must be even - for (count >>= 1; count > 0; --count) - { - color1 = *(__m64 *)from; // 00r2g2b2 00r1g1b1 - from += 2; - color2 = _mm_unpackhi_pi8(color1, zero); // 000000r2 00g200b2 - color1 = _mm_unpacklo_pi8(color1, zero); // 000000r1 00g100b1 - color1 = _mm_mullo_pi16(blendalpha, color1); // 0000r1rr g1ggb1bb - color2 = _mm_mullo_pi16(blendalpha, color2); // 0000r2rr g2ggb2bb - color1 = _mm_adds_pu16(blendcolor, color1); - color2 = _mm_adds_pu16(blendcolor, color2); - color1 = _mm_srli_pi16(color1, 8); - color2 = _mm_srli_pi16(color2, 8); - *(__m64 *)to = _mm_packs_pu16(color1, color2); // 00r2g2b2 00r1g1b1 - to += 2; - } - _mm_empty(); -} -#endif - void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) { diff --git a/strifehelp.acs b/wadsrc/static/strifehelp.acs similarity index 100% rename from strifehelp.acs rename to wadsrc/static/strifehelp.acs