mirror of
https://github.com/ZDoom/gzdoom-gles.git
synced 2024-11-10 23:01:59 +00:00
This commit is contained in:
commit
88f8c4afcc
47 changed files with 499 additions and 1350 deletions
|
@ -10,7 +10,7 @@ if( COMMAND cmake_policy )
|
|||
endif()
|
||||
endif()
|
||||
|
||||
list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR} )
|
||||
list( APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake )
|
||||
include( CreateLaunchers )
|
||||
include( FindPackageHandleStandardArgs )
|
||||
|
||||
|
@ -148,8 +148,6 @@ if( ZD_CMAKE_COMPILER_IS_GNUCXX_COMPATIBLE )
|
|||
set( PROFILE 0 CACHE BOOL "Enable profiling with gprof for Debug and RelWithDebInfo build types." )
|
||||
endif()
|
||||
|
||||
set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}")
|
||||
|
||||
option( NO_OPENAL "Disable OpenAL sound support" OFF )
|
||||
|
||||
find_package( BZip2 )
|
||||
|
|
157
cmake/TargetArch.cmake
Normal file
157
cmake/TargetArch.cmake
Normal file
|
@ -0,0 +1,157 @@
|
|||
|
||||
# Copyright (c) 2012 Petroules Corporation. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without modification,
|
||||
# are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation and/or
|
||||
# other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
# IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
||||
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
|
||||
# OF SUCH DAMAGE.
|
||||
|
||||
# Based on the Qt 5 processor detection code, so should be very accurate
|
||||
# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h
|
||||
# Currently handles arm (v5, v6, v7), x86 (32/64), ia64, and ppc (32/64)
|
||||
|
||||
# Regarding POWER/PowerPC, just as is noted in the Qt source,
|
||||
# "There are many more known variants/revisions that we do not handle/detect."
|
||||
|
||||
set(archdetect_c_code "
|
||||
#if defined(__arm__) || defined(__TARGET_ARCH_ARM)
|
||||
#if defined(__ARM_ARCH_7__) \\
|
||||
|| defined(__ARM_ARCH_7A__) \\
|
||||
|| defined(__ARM_ARCH_7R__) \\
|
||||
|| defined(__ARM_ARCH_7M__) \\
|
||||
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
|
||||
#error cmake_ARCH armv7
|
||||
#elif defined(__ARM_ARCH_6__) \\
|
||||
|| defined(__ARM_ARCH_6J__) \\
|
||||
|| defined(__ARM_ARCH_6T2__) \\
|
||||
|| defined(__ARM_ARCH_6Z__) \\
|
||||
|| defined(__ARM_ARCH_6K__) \\
|
||||
|| defined(__ARM_ARCH_6ZK__) \\
|
||||
|| defined(__ARM_ARCH_6M__) \\
|
||||
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
|
||||
#error cmake_ARCH armv6
|
||||
#elif defined(__ARM_ARCH_5TEJ__) \\
|
||||
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
|
||||
#error cmake_ARCH armv5
|
||||
#else
|
||||
#error cmake_ARCH arm
|
||||
#endif
|
||||
#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
|
||||
#error cmake_ARCH i386
|
||||
#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
|
||||
#error cmake_ARCH x86_64
|
||||
#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
|
||||
#error cmake_ARCH ia64
|
||||
#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\
|
||||
|| defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \\
|
||||
|| defined(_M_MPPC) || defined(_M_PPC)
|
||||
#if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
|
||||
#error cmake_ARCH ppc64
|
||||
#else
|
||||
#error cmake_ARCH ppc
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#error cmake_ARCH unknown
|
||||
")
|
||||
|
||||
# Set ppc_support to TRUE before including this file or ppc and ppc64
|
||||
# will be treated as invalid architectures since they are no longer supported by Apple
|
||||
|
||||
function(target_architecture output_var)
|
||||
if(APPLE AND CMAKE_OSX_ARCHITECTURES)
|
||||
# On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set
|
||||
# First let's normalize the order of the values
|
||||
|
||||
# Note that it's not possible to compile PowerPC applications if you are using
|
||||
# the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we
|
||||
# disable it by default
|
||||
# See this page for more information:
|
||||
# http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4
|
||||
|
||||
# Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime.
|
||||
# On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise.
|
||||
|
||||
foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES})
|
||||
if("${osx_arch}" STREQUAL "ppc" AND ppc_support)
|
||||
set(osx_arch_ppc TRUE)
|
||||
elseif("${osx_arch}" STREQUAL "i386")
|
||||
set(osx_arch_i386 TRUE)
|
||||
elseif("${osx_arch}" STREQUAL "x86_64")
|
||||
set(osx_arch_x86_64 TRUE)
|
||||
elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support)
|
||||
set(osx_arch_ppc64 TRUE)
|
||||
else()
|
||||
message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
# Now add all the architectures in our normalized order
|
||||
if(osx_arch_ppc)
|
||||
list(APPEND ARCH ppc)
|
||||
endif()
|
||||
|
||||
if(osx_arch_i386)
|
||||
list(APPEND ARCH i386)
|
||||
endif()
|
||||
|
||||
if(osx_arch_x86_64)
|
||||
list(APPEND ARCH x86_64)
|
||||
endif()
|
||||
|
||||
if(osx_arch_ppc64)
|
||||
list(APPEND ARCH ppc64)
|
||||
endif()
|
||||
else()
|
||||
file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}")
|
||||
|
||||
enable_language(C)
|
||||
|
||||
# Detect the architecture in a rather creative way...
|
||||
# This compiles a small C program which is a series of ifdefs that selects a
|
||||
# particular #error preprocessor directive whose message string contains the
|
||||
# target architecture. The program will always fail to compile (both because
|
||||
# file is not a valid C program, and obviously because of the presence of the
|
||||
# #error preprocessor directives... but by exploiting the preprocessor in this
|
||||
# way, we can detect the correct target architecture even when cross-compiling,
|
||||
# since the program itself never needs to be run (only the compiler/preprocessor)
|
||||
try_run(
|
||||
run_result_unused
|
||||
compile_result_unused
|
||||
"${CMAKE_BINARY_DIR}"
|
||||
"${CMAKE_BINARY_DIR}/arch.c"
|
||||
COMPILE_OUTPUT_VARIABLE ARCH
|
||||
CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
|
||||
)
|
||||
|
||||
# Parse the architecture name from the compiler output
|
||||
string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}")
|
||||
|
||||
# Get rid of the value marker leaving just the architecture name
|
||||
string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}")
|
||||
|
||||
# If we are compiling with an unknown architecture this variable should
|
||||
# already be set to "unknown" but in the case that it's empty (i.e. due
|
||||
# to a typo in the code), then set it to unknown
|
||||
if (NOT ARCH)
|
||||
set(ARCH unknown)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(${output_var} "${ARCH}" PARENT_SCOPE)
|
||||
endfunction()
|
|
@ -1,6 +1,6 @@
|
|||
cmake_minimum_required( VERSION 2.8.7 )
|
||||
|
||||
include(../precompiled_headers.cmake)
|
||||
include(precompiled_headers)
|
||||
|
||||
if( COMMAND cmake_policy )
|
||||
cmake_policy( SET CMP0003 NEW )
|
||||
|
@ -13,6 +13,7 @@ include( CheckIncludeFile )
|
|||
include( CheckIncludeFiles )
|
||||
include( CheckLibraryExists )
|
||||
include( FindPkgConfig )
|
||||
include( TargetArch )
|
||||
|
||||
if( ZD_CMAKE_COMPILER_IS_GNUCXX_COMPATIBLE )
|
||||
option( NO_STRIP "Do not strip Release or MinSizeRel builds" )
|
||||
|
@ -32,7 +33,9 @@ if( APPLE )
|
|||
option( OSX_COCOA_BACKEND "Use native Cocoa backend instead of SDL" ON )
|
||||
endif()
|
||||
|
||||
if( CMAKE_SIZEOF_VOID_P MATCHES "8" )
|
||||
target_architecture(ZDOOM_TARGET_ARCH)
|
||||
|
||||
if( ${ZDOOM_TARGET_ARCH} MATCHES "x86_64" )
|
||||
set( X64 64 )
|
||||
endif()
|
||||
|
||||
|
@ -347,6 +350,11 @@ if( ZD_CMAKE_COMPILER_IS_GNUCXX_COMPATIBLE )
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if( NOT X64 AND NOT CAN_DO_MFPMATH )
|
||||
set( CMAKE_C_FLAGS "-DNO_SSE ${CMAKE_CXX_FLAGS}" )
|
||||
set( CMAKE_CXX_FLAGS "-DNO_SSE ${CMAKE_CXX_FLAGS}" )
|
||||
endif()
|
||||
|
||||
# Use the highest C++ standard available since VS2015 compiles with C++14
|
||||
# but we only require C++11. The recommended way to do this in CMake is to
|
||||
# probably to use target_compile_features, but I don't feel like maintaining
|
||||
|
|
|
@ -14,11 +14,7 @@ extern double gl_MillisecPerCycle;
|
|||
|
||||
__forceinline int64_t GetClockCycle ()
|
||||
{
|
||||
#if _M_X64
|
||||
return __rdtsc();
|
||||
#else
|
||||
return CPU.bRDTSC ? __rdtsc() : 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
#elif defined __APPLE__ && (defined __i386__ || defined __x86_64__)
|
||||
|
|
|
@ -1,739 +0,0 @@
|
|||
/*
|
||||
** Projected triangle drawer
|
||||
** Copyright (c) 2016 Magnus Norddahl
|
||||
**
|
||||
** This software is provided 'as-is', without any express or implied
|
||||
** warranty. In no event will the authors be held liable for any damages
|
||||
** arising from the use of this software.
|
||||
**
|
||||
** Permission is granted to anyone to use this software for any purpose,
|
||||
** including commercial applications, and to alter it and redistribute it
|
||||
** freely, subject to the following restrictions:
|
||||
**
|
||||
** 1. The origin of this software must not be misrepresented; you must not
|
||||
** claim that you wrote the original software. If you use this software
|
||||
** in a product, an acknowledgment in the product documentation would be
|
||||
** appreciated but is not required.
|
||||
** 2. Altered source versions must be plainly marked as such, and must not be
|
||||
** misrepresented as being the original software.
|
||||
** 3. This notice may not be removed or altered from any source distribution.
|
||||
**
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "screen_triangle.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(disable: 4752) // warning C4752 : found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
|
||||
#endif
|
||||
|
||||
namespace TriScreenDrawerModes
|
||||
{
|
||||
template<typename SamplerT, typename FilterModeT>
|
||||
FORCEINLINE unsigned int VECTORCALL Sample32_AVX2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, uint32_t oneU, uint32_t oneV, uint32_t color, const uint32_t *translation)
|
||||
{
|
||||
uint32_t texel;
|
||||
if (SamplerT::Mode == (int)Samplers::Shaded || SamplerT::Mode == (int)Samplers::Stencil || SamplerT::Mode == (int)Samplers::Fill || SamplerT::Mode == (int)Samplers::Fuzz)
|
||||
{
|
||||
return color;
|
||||
}
|
||||
else if (SamplerT::Mode == (int)Samplers::Translated)
|
||||
{
|
||||
const uint8_t *texpal = (const uint8_t *)texPixels;
|
||||
uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16;
|
||||
uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16;
|
||||
return translation[texpal[texelX * texHeight + texelY]];
|
||||
}
|
||||
else if (FilterModeT::Mode == (int)FilterModes::Nearest)
|
||||
{
|
||||
uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16;
|
||||
uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16;
|
||||
texel = texPixels[texelX * texHeight + texelY];
|
||||
}
|
||||
else
|
||||
{
|
||||
u -= oneU >> 1;
|
||||
v -= oneV >> 1;
|
||||
|
||||
unsigned int frac_x0 = (((uint32_t)u << 8) >> FRACBITS) * texWidth;
|
||||
unsigned int frac_x1 = ((((uint32_t)u << 8) + oneU) >> FRACBITS) * texWidth;
|
||||
unsigned int frac_y0 = (((uint32_t)v << 8) >> FRACBITS) * texHeight;
|
||||
unsigned int frac_y1 = ((((uint32_t)v << 8) + oneV) >> FRACBITS) * texHeight;
|
||||
unsigned int x0 = frac_x0 >> FRACBITS;
|
||||
unsigned int x1 = frac_x1 >> FRACBITS;
|
||||
unsigned int y0 = frac_y0 >> FRACBITS;
|
||||
unsigned int y1 = frac_y1 >> FRACBITS;
|
||||
|
||||
unsigned int p00 = texPixels[x0 * texHeight + y0];
|
||||
unsigned int p01 = texPixels[x0 * texHeight + y1];
|
||||
unsigned int p10 = texPixels[x1 * texHeight + y0];
|
||||
unsigned int p11 = texPixels[x1 * texHeight + y1];
|
||||
|
||||
unsigned int inv_a = (frac_x1 >> (FRACBITS - 4)) & 15;
|
||||
unsigned int inv_b = (frac_y1 >> (FRACBITS - 4)) & 15;
|
||||
unsigned int a = 16 - inv_a;
|
||||
unsigned int b = 16 - inv_b;
|
||||
|
||||
unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8;
|
||||
unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8;
|
||||
unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8;
|
||||
unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8;
|
||||
|
||||
texel = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue;
|
||||
}
|
||||
|
||||
if (SamplerT::Mode == (int)Samplers::Skycap)
|
||||
{
|
||||
int start_fade = 2; // How fast it should fade out
|
||||
|
||||
int alpha_top = clamp(v >> (16 - start_fade), 0, 256);
|
||||
int alpha_bottom = clamp(((2 << 24) - v) >> (16 - start_fade), 0, 256);
|
||||
int a = MIN(alpha_top, alpha_bottom);
|
||||
int inv_a = 256 - a;
|
||||
|
||||
uint32_t r = RPART(texel);
|
||||
uint32_t g = GPART(texel);
|
||||
uint32_t b = BPART(texel);
|
||||
uint32_t fg_a = APART(texel);
|
||||
uint32_t bg_red = RPART(color);
|
||||
uint32_t bg_green = GPART(color);
|
||||
uint32_t bg_blue = BPART(color);
|
||||
r = (r * a + bg_red * inv_a + 127) >> 8;
|
||||
g = (g * a + bg_green * inv_a + 127) >> 8;
|
||||
b = (b * a + bg_blue * inv_a + 127) >> 8;
|
||||
return MAKEARGB(fg_a, r, g, b);
|
||||
}
|
||||
else
|
||||
{
|
||||
return texel;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename SamplerT>
|
||||
FORCEINLINE unsigned int VECTORCALL SampleShade32_AVX2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, int &fuzzpos)
|
||||
{
|
||||
if (SamplerT::Mode == (int)Samplers::Shaded)
|
||||
{
|
||||
const uint8_t *texpal = (const uint8_t *)texPixels;
|
||||
uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16;
|
||||
uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16;
|
||||
unsigned int sampleshadeout = texpal[texelX * texHeight + texelY];
|
||||
sampleshadeout += sampleshadeout >> 7; // 255 -> 256
|
||||
return sampleshadeout;
|
||||
}
|
||||
else if (SamplerT::Mode == (int)Samplers::Stencil)
|
||||
{
|
||||
uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16;
|
||||
uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16;
|
||||
unsigned int sampleshadeout = APART(texPixels[texelX * texHeight + texelY]);
|
||||
sampleshadeout += sampleshadeout >> 7; // 255 -> 256
|
||||
return sampleshadeout;
|
||||
}
|
||||
else if (SamplerT::Mode == (int)Samplers::Fuzz)
|
||||
{
|
||||
uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16;
|
||||
uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16;
|
||||
unsigned int sampleshadeout = APART(texPixels[texelX * texHeight + texelY]);
|
||||
sampleshadeout += sampleshadeout >> 7; // 255 -> 256
|
||||
sampleshadeout = (sampleshadeout * fuzzcolormap[fuzzpos++]) >> 5;
|
||||
if (fuzzpos >= FUZZTABLE) fuzzpos = 0;
|
||||
return sampleshadeout;
|
||||
}
|
||||
else
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename ShadeModeT>
|
||||
FORCEINLINE __m256i VECTORCALL Shade32_AVX2(__m256i fgcolor, __m256i mlight, __m256i desaturate, __m256i inv_desaturate, __m256i shade_fade, __m256i shade_light)
|
||||
{
|
||||
if (ShadeModeT::Mode == (int)ShadeMode::Simple)
|
||||
{
|
||||
fgcolor = _mm256_srli_epi16(_mm256_mullo_epi16(fgcolor, mlight), 8);
|
||||
}
|
||||
else if (ShadeModeT::Mode == (int)ShadeMode::Advanced)
|
||||
{
|
||||
__m256i intensity = _mm256_mullo_epi16(fgcolor, _mm256_set_epi16(0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37));
|
||||
intensity = _mm256_add_epi16(intensity, _mm256_srli_epi64(intensity, 32));
|
||||
intensity = _mm256_add_epi16(intensity, _mm256_srli_epi64(intensity, 16));
|
||||
intensity = _mm256_srli_epi16(intensity, 8);
|
||||
intensity = _mm256_mullo_epi16(intensity, desaturate);
|
||||
intensity = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(intensity, _MM_SHUFFLE(3, 0, 0, 0)), _MM_SHUFFLE(3, 0, 0, 0));
|
||||
|
||||
fgcolor = _mm256_srli_epi16(_mm256_add_epi16(_mm256_mullo_epi16(fgcolor, inv_desaturate), intensity), 8);
|
||||
fgcolor = _mm256_mullo_epi16(fgcolor, mlight);
|
||||
fgcolor = _mm256_srli_epi16(_mm256_add_epi16(shade_fade, fgcolor), 8);
|
||||
fgcolor = _mm256_srli_epi16(_mm256_mullo_epi16(fgcolor, shade_light), 8);
|
||||
}
|
||||
return fgcolor;
|
||||
}
|
||||
|
||||
template<typename BlendT>
|
||||
FORCEINLINE __m256i VECTORCALL Blend32_AVX2(__m256i fgcolor, __m256i bgcolor, __m256i ifgcolor, __m256i ifgshade, __m256i srcalpha, __m256i destalpha)
|
||||
{
|
||||
if (BlendT::Mode == (int)BlendModes::Opaque)
|
||||
{
|
||||
__m256i outcolor = fgcolor;
|
||||
outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256());
|
||||
return outcolor;
|
||||
}
|
||||
else if (BlendT::Mode == (int)BlendModes::Masked)
|
||||
{
|
||||
__m256i mask = _mm256_cmpeq_epi32(_mm256_packus_epi16(fgcolor, _mm256_setzero_si256()), _mm256_setzero_si256());
|
||||
mask = _mm256_unpacklo_epi8(mask, _mm256_setzero_si256());
|
||||
__m256i outcolor = _mm256_or_si256(_mm256_and_si256(mask, bgcolor), _mm256_andnot_si256(mask, fgcolor));
|
||||
outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256());
|
||||
outcolor = _mm256_or_si256(outcolor, _mm256_set1_epi32(0xff000000));
|
||||
return outcolor;
|
||||
}
|
||||
else if (BlendT::Mode == (int)BlendModes::AddSrcColorOneMinusSrcColor)
|
||||
{
|
||||
__m256i inv_srccolor = _mm256_sub_epi16(_mm256_set1_epi16(256), _mm256_add_epi16(fgcolor, _mm256_srli_epi16(fgcolor, 7)));
|
||||
__m256i outcolor = _mm256_add_epi16(fgcolor, _mm256_srli_epi16(_mm256_mullo_epi16(bgcolor, inv_srccolor), 8));
|
||||
outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256());
|
||||
return outcolor;
|
||||
}
|
||||
else if (BlendT::Mode == (int)BlendModes::Shaded)
|
||||
{
|
||||
ifgshade = _mm256_srli_epi32(_mm256_add_epi32(_mm256_mul_epu32(ifgshade, srcalpha), _mm256_set1_epi32(128)), 8);
|
||||
__m256i alpha = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(ifgshade, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0));
|
||||
__m256i inv_alpha = _mm256_sub_epi16(_mm256_set1_epi16(256), alpha);
|
||||
|
||||
fgcolor = _mm256_mullo_epi16(fgcolor, alpha);
|
||||
bgcolor = _mm256_mullo_epi16(bgcolor, inv_alpha);
|
||||
__m256i outcolor = _mm256_srli_epi16(_mm256_add_epi16(fgcolor, bgcolor), 8);
|
||||
outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256());
|
||||
outcolor = _mm256_or_si256(outcolor, _mm256_set1_epi32(0xff000000));
|
||||
return outcolor;
|
||||
}
|
||||
else if (BlendT::Mode == (int)BlendModes::AddClampShaded)
|
||||
{
|
||||
ifgshade = _mm256_srli_epi32(_mm256_add_epi32(_mm256_mul_epu32(ifgshade, srcalpha), _mm256_set1_epi32(128)), 8);
|
||||
__m256i alpha = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(ifgshade, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0));
|
||||
__m256i inv_alpha = _mm256_sub_epi16(_mm256_set1_epi16(256), alpha);
|
||||
|
||||
fgcolor = _mm256_srli_epi16(_mm256_mullo_epi16(fgcolor, alpha), 8);
|
||||
__m256i outcolor = _mm256_add_epi16(fgcolor, bgcolor);
|
||||
outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256());
|
||||
outcolor = _mm256_or_si256(outcolor, _mm256_set1_epi32(0xff000000));
|
||||
return outcolor;
|
||||
}
|
||||
else
|
||||
{
|
||||
__m256i alpha = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(ifgcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
|
||||
alpha = _mm256_srli_epi16(_mm256_add_epi16(alpha, _mm256_srli_epi16(alpha, 7)), 1); // 255->128
|
||||
__m256i inv_alpha = _mm256_sub_epi16(_mm256_set1_epi16(128), alpha);
|
||||
|
||||
__m256i bgalpha = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_mullo_epi16(destalpha, alpha), _mm256_slli_epi16(inv_alpha, 8)), _mm256_set1_epi32(64)), 7);
|
||||
__m256i fgalpha = _mm256_srli_epi16(_mm256_add_epi16(_mm256_mullo_epi16(srcalpha, alpha), _mm256_set1_epi32(64)), 7);
|
||||
|
||||
fgcolor = _mm256_mullo_epi16(fgcolor, fgalpha);
|
||||
bgcolor = _mm256_mullo_epi16(bgcolor, bgalpha);
|
||||
|
||||
__m256i fg_lo = _mm256_unpacklo_epi16(fgcolor, _mm256_setzero_si256());
|
||||
__m256i bg_lo = _mm256_unpacklo_epi16(bgcolor, _mm256_setzero_si256());
|
||||
__m256i fg_hi = _mm256_unpackhi_epi16(fgcolor, _mm256_setzero_si256());
|
||||
__m256i bg_hi = _mm256_unpackhi_epi16(bgcolor, _mm256_setzero_si256());
|
||||
|
||||
__m256i out_lo, out_hi;
|
||||
if (BlendT::Mode == (int)BlendModes::AddClamp)
|
||||
{
|
||||
out_lo = _mm256_add_epi32(fg_lo, bg_lo);
|
||||
out_hi = _mm256_add_epi32(fg_hi, bg_hi);
|
||||
}
|
||||
else if (BlendT::Mode == (int)BlendModes::SubClamp)
|
||||
{
|
||||
out_lo = _mm256_sub_epi32(fg_lo, bg_lo);
|
||||
out_hi = _mm256_sub_epi32(fg_hi, bg_hi);
|
||||
}
|
||||
else if (BlendT::Mode == (int)BlendModes::RevSubClamp)
|
||||
{
|
||||
out_lo = _mm256_sub_epi32(bg_lo, fg_lo);
|
||||
out_hi = _mm256_sub_epi32(bg_hi, fg_hi);
|
||||
}
|
||||
|
||||
out_lo = _mm256_srai_epi32(out_lo, 8);
|
||||
out_hi = _mm256_srai_epi32(out_hi, 8);
|
||||
__m256i outcolor = _mm256_packs_epi32(out_lo, out_hi);
|
||||
outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256());
|
||||
outcolor = _mm256_or_si256(outcolor, _mm256_set1_epi32(0xff000000));
|
||||
return outcolor;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename BlendT, typename SamplerT>
|
||||
class TriScreenDrawer32_AVX2
|
||||
{
|
||||
public:
|
||||
static void Execute(int x, int y, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args)
|
||||
{
|
||||
using namespace TriScreenDrawerModes;
|
||||
|
||||
bool is_simple_shade = args->uniforms->SimpleShade();
|
||||
|
||||
if (SamplerT::Mode == (int)Samplers::Texture)
|
||||
{
|
||||
bool is_nearest_filter = args->uniforms->NearestFilter();
|
||||
|
||||
if (is_simple_shade)
|
||||
{
|
||||
if (is_nearest_filter)
|
||||
DrawBlock<SimpleShade, NearestFilter>(x, y, mask0, mask1, args);
|
||||
else
|
||||
DrawBlock<SimpleShade, LinearFilter>(x, y, mask0, mask1, args);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (is_nearest_filter)
|
||||
DrawBlock<AdvancedShade, NearestFilter>(x, y, mask0, mask1, args);
|
||||
else
|
||||
DrawBlock<AdvancedShade, LinearFilter>(x, y, mask0, mask1, args);
|
||||
}
|
||||
}
|
||||
else if (SamplerT::Mode == (int)Samplers::Fuzz)
|
||||
{
|
||||
DrawBlock<NoShade, NearestFilter>(x, y, mask0, mask1, args);
|
||||
}
|
||||
else // no linear filtering for translated, shaded, stencil, fill or skycap
|
||||
{
|
||||
if (is_simple_shade)
|
||||
{
|
||||
DrawBlock<SimpleShade, NearestFilter>(x, y, mask0, mask1, args);
|
||||
}
|
||||
else
|
||||
{
|
||||
DrawBlock<AdvancedShade, NearestFilter>(x, y, mask0, mask1, args);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
template<typename ShadeModeT, typename FilterModeT>
|
||||
FORCEINLINE static void VECTORCALL DrawBlock(int destX, int destY, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args)
|
||||
{
|
||||
using namespace TriScreenDrawerModes;
|
||||
|
||||
bool is_fixed_light = args->uniforms->FixedLight();
|
||||
__m128i lightmask = _mm_set1_epi32(is_fixed_light ? 0 : 0xffffffff);
|
||||
__m256i srcalpha = _mm256_set1_epi16(args->uniforms->SrcAlpha());
|
||||
__m256i destalpha = _mm256_set1_epi16(args->uniforms->DestAlpha());
|
||||
|
||||
int fuzzpos = (ScreenTriangle::FuzzStart + destX * 123 + destY) % FUZZTABLE;
|
||||
|
||||
// Light
|
||||
uint32_t light = args->uniforms->Light();
|
||||
float shade = MIN(2.0f - (light + 12.0f) / 128.0f, 31.0f / 32.0f);
|
||||
float globVis = args->uniforms->GlobVis() * (1.0f / 32.0f);
|
||||
light += (light >> 7); // 255 -> 256
|
||||
light <<= 8;
|
||||
__m128i fixedlight = _mm_set1_epi32(light);
|
||||
|
||||
// Calculate gradients
|
||||
const TriVertex &v1 = *args->v1;
|
||||
__m128 gradientX = _mm_setr_ps(args->gradientX.W, args->gradientX.U, args->gradientX.V, 0.0f);
|
||||
__m128 gradientY = _mm_setr_ps(args->gradientY.W, args->gradientY.U, args->gradientY.V, 0.0f);
|
||||
__m128 blockPosY = _mm_add_ps(_mm_add_ps(
|
||||
_mm_setr_ps(v1.w, v1.u * v1.w, v1.v * v1.w, globVis),
|
||||
_mm_mul_ps(gradientX, _mm_set1_ps(destX - v1.x))),
|
||||
_mm_mul_ps(gradientY, _mm_set1_ps(destY - v1.y)));
|
||||
gradientX = _mm_mul_ps(gradientX, _mm_set1_ps(8.0f));
|
||||
|
||||
// Output
|
||||
uint32_t * RESTRICT destOrg = (uint32_t*)args->dest;
|
||||
int pitch = args->pitch;
|
||||
uint32_t *dest = destOrg + destX + destY * pitch;
|
||||
int offset_next_line = pitch - 8;
|
||||
|
||||
// Sampling stuff
|
||||
uint32_t color = args->uniforms->Color();
|
||||
const uint32_t * RESTRICT translation = (const uint32_t *)args->uniforms->Translation();
|
||||
const uint32_t * RESTRICT texPixels = (const uint32_t *)args->uniforms->TexturePixels();
|
||||
uint32_t texWidth = args->uniforms->TextureWidth();
|
||||
uint32_t texHeight = args->uniforms->TextureHeight();
|
||||
uint32_t oneU, oneV;
|
||||
if (SamplerT::Mode != (int)Samplers::Fill)
|
||||
{
|
||||
oneU = ((0x800000 + texWidth - 1) / texWidth) * 2 + 1;
|
||||
oneV = ((0x800000 + texHeight - 1) / texHeight) * 2 + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
oneU = 0;
|
||||
oneV = 0;
|
||||
}
|
||||
|
||||
// Shade constants
|
||||
__m256i inv_desaturate, shade_fade, shade_light;
|
||||
__m256i desaturate;
|
||||
if (ShadeModeT::Mode == (int)ShadeMode::Advanced)
|
||||
{
|
||||
inv_desaturate = _mm256_setr_epi16(
|
||||
256, 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(),
|
||||
256, 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(),
|
||||
256, 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(),
|
||||
256, 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate());
|
||||
shade_fade = _mm256_set_epi16(
|
||||
args->uniforms->ShadeFadeAlpha(), args->uniforms->ShadeFadeRed(), args->uniforms->ShadeFadeGreen(), args->uniforms->ShadeFadeBlue(),
|
||||
args->uniforms->ShadeFadeAlpha(), args->uniforms->ShadeFadeRed(), args->uniforms->ShadeFadeGreen(), args->uniforms->ShadeFadeBlue(),
|
||||
args->uniforms->ShadeFadeAlpha(), args->uniforms->ShadeFadeRed(), args->uniforms->ShadeFadeGreen(), args->uniforms->ShadeFadeBlue(),
|
||||
args->uniforms->ShadeFadeAlpha(), args->uniforms->ShadeFadeRed(), args->uniforms->ShadeFadeGreen(), args->uniforms->ShadeFadeBlue());
|
||||
shade_light = _mm256_set_epi16(
|
||||
args->uniforms->ShadeLightAlpha(), args->uniforms->ShadeLightRed(), args->uniforms->ShadeLightGreen(), args->uniforms->ShadeLightBlue(),
|
||||
args->uniforms->ShadeLightAlpha(), args->uniforms->ShadeLightRed(), args->uniforms->ShadeLightGreen(), args->uniforms->ShadeLightBlue(),
|
||||
args->uniforms->ShadeLightAlpha(), args->uniforms->ShadeLightRed(), args->uniforms->ShadeLightGreen(), args->uniforms->ShadeLightBlue(),
|
||||
args->uniforms->ShadeLightAlpha(), args->uniforms->ShadeLightRed(), args->uniforms->ShadeLightGreen(), args->uniforms->ShadeLightBlue());
|
||||
desaturate = _mm256_sub_epi16(_mm256_set1_epi16(256), inv_desaturate);
|
||||
}
|
||||
else
|
||||
{
|
||||
inv_desaturate = _mm256_setzero_si256();
|
||||
shade_fade = _mm256_setzero_si256();
|
||||
shade_fade = _mm256_setzero_si256();
|
||||
shade_light = _mm256_setzero_si256();
|
||||
desaturate = _mm256_setzero_si256();
|
||||
}
|
||||
|
||||
if (mask0 == 0xffffffff && mask1 == 0xffffffff)
|
||||
{
|
||||
for (int y = 0; y < 8; y++)
|
||||
{
|
||||
__m128 blockPosX = _mm_add_ps(blockPosY, gradientX);
|
||||
__m128 W = _mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
__m128 rcpW = _mm_div_ps(_mm_set1_ps((float)0x01000000), W);
|
||||
__m128i posUV = _mm_cvtps_epi32(_mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(2, 1, 2, 1)), rcpW));
|
||||
|
||||
__m128 vis = _mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(3, 3, 3, 3)), W);
|
||||
__m128i lightpospair = _mm_sub_epi32(
|
||||
_mm_set1_epi32(FRACUNIT),
|
||||
_mm_cvtps_epi32(_mm_mul_ps(
|
||||
_mm_max_ps(_mm_sub_ps(_mm_set1_ps(shade), _mm_min_ps(_mm_set1_ps(24.0f / 32.0f), vis)), _mm_setzero_ps()),
|
||||
_mm_set1_ps((float)FRACUNIT))));
|
||||
lightpospair = _mm_or_si128(_mm_and_si128(lightmask, lightpospair), _mm_andnot_si128(lightmask, fixedlight));
|
||||
|
||||
int32_t posU = _mm_cvtsi128_si32(posUV);
|
||||
int32_t posV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 4));
|
||||
int32_t nextU = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 8));
|
||||
int32_t nextV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 12));
|
||||
int32_t lightpos = _mm_cvtsi128_si32(lightpospair);
|
||||
int32_t lightnext = _mm_cvtsi128_si32(_mm_srli_si128(lightpospair, 8));
|
||||
int32_t stepU = (nextU - posU) >> 3;
|
||||
int32_t stepV = (nextV - posV) >> 3;
|
||||
fixed_t lightstep = (lightnext - lightpos) >> 3;
|
||||
|
||||
for (int ix = 0; ix < 2; ix++)
|
||||
{
|
||||
// Load bgcolor
|
||||
__m256i bgcolor;
|
||||
if (BlendT::Mode != (int)BlendModes::Opaque)
|
||||
{
|
||||
__m128i bgpacked = _mm_loadu_si128((__m128i*)dest);
|
||||
bgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(bgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(bgpacked, _mm_setzero_si128()));
|
||||
}
|
||||
else
|
||||
bgcolor = _mm256_setzero_si256();
|
||||
|
||||
// Sample fgcolor
|
||||
unsigned int ifgcolor0 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
unsigned int ifgshade0 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
unsigned int ifgcolor1 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
unsigned int ifgshade1 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
unsigned int ifgcolor2 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
unsigned int ifgshade2 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
unsigned int ifgcolor3 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
unsigned int ifgshade3 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
// Setup light
|
||||
int lightpos0 = lightpos >> 8;
|
||||
lightpos += lightstep;
|
||||
int lightpos1 = lightpos >> 8;
|
||||
lightpos += lightstep;
|
||||
int lightpos2 = lightpos >> 8;
|
||||
lightpos += lightstep;
|
||||
int lightpos3 = lightpos >> 8;
|
||||
lightpos += lightstep;
|
||||
__m256i mlight = _mm256_set_epi16(
|
||||
256, lightpos3, lightpos3, lightpos3,
|
||||
256, lightpos2, lightpos2, lightpos2,
|
||||
256, lightpos1, lightpos1, lightpos1,
|
||||
256, lightpos0, lightpos0, lightpos0);
|
||||
|
||||
__m256i shade_fade_lit;
|
||||
if (ShadeModeT::Mode == (int)ShadeMode::Advanced)
|
||||
{
|
||||
__m256i inv_light = _mm256_sub_epi16(_mm256_set_epi16(0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256), mlight);
|
||||
shade_fade_lit = _mm256_mullo_epi16(shade_fade, inv_light);
|
||||
}
|
||||
else
|
||||
{
|
||||
shade_fade_lit = _mm256_setzero_si256();
|
||||
}
|
||||
|
||||
// Shade and blend
|
||||
__m128i fgpacked = _mm_set_epi32(ifgcolor3, ifgcolor2, ifgcolor1, ifgcolor0);
|
||||
__m128i shadepacked = _mm_set_epi32(ifgshade3, ifgshade2, ifgshade1, ifgshade0);
|
||||
__m256i mifgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(fgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(fgpacked, _mm_setzero_si128()));
|
||||
__m256i mifgshade = _mm256_set_m128i(_mm_unpackhi_epi32(shadepacked, shadepacked), _mm_unpacklo_epi32(shadepacked, shadepacked));
|
||||
__m256i fgcolor = mifgcolor;
|
||||
fgcolor = Shade32_AVX2<ShadeModeT>(fgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light);
|
||||
__m256i outcolor = Blend32_AVX2<BlendT>(fgcolor, bgcolor, mifgcolor, mifgshade, srcalpha, destalpha);
|
||||
|
||||
// Store result
|
||||
_mm_storeu_si128((__m128i*)dest, _mm_or_si128(_mm256_extracti128_si256(outcolor, 0), _mm_slli_si128(_mm256_extracti128_si256(outcolor, 1), 8)));
|
||||
dest += 4;
|
||||
}
|
||||
|
||||
blockPosY = _mm_add_ps(blockPosY, gradientY);
|
||||
|
||||
dest += offset_next_line;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// mask0 loop:
|
||||
for (int y = 0; y < 4; y++)
|
||||
{
|
||||
__m128 blockPosX = _mm_add_ps(blockPosY, gradientX);
|
||||
__m128 W = _mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
__m128 rcpW = _mm_div_ps(_mm_set1_ps((float)0x01000000), W);
|
||||
__m128i posUV = _mm_cvtps_epi32(_mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(2, 1, 2, 1)), rcpW));
|
||||
|
||||
__m128 vis = _mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(3, 3, 3, 3)), W);
|
||||
__m128i lightpospair = _mm_sub_epi32(
|
||||
_mm_set1_epi32(FRACUNIT),
|
||||
_mm_cvtps_epi32(_mm_mul_ps(
|
||||
_mm_max_ps(_mm_sub_ps(_mm_set1_ps(shade), _mm_min_ps(_mm_set1_ps(24.0f / 32.0f), vis)), _mm_setzero_ps()),
|
||||
_mm_set1_ps((float)FRACUNIT))));
|
||||
lightpospair = _mm_or_si128(_mm_and_si128(lightmask, lightpospair), _mm_andnot_si128(lightmask, fixedlight));
|
||||
|
||||
int32_t posU = _mm_cvtsi128_si32(posUV);
|
||||
int32_t posV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 4));
|
||||
int32_t nextU = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 8));
|
||||
int32_t nextV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 12));
|
||||
int32_t lightpos = _mm_cvtsi128_si32(lightpospair);
|
||||
int32_t lightnext = _mm_cvtsi128_si32(_mm_srli_si128(lightpospair, 8));
|
||||
int32_t stepU = (nextU - posU) >> 3;
|
||||
int32_t stepV = (nextV - posV) >> 3;
|
||||
fixed_t lightstep = (lightnext - lightpos) >> 3;
|
||||
|
||||
for (int x = 0; x < 2; x++)
|
||||
{
|
||||
// Load bgcolor
|
||||
uint32_t desttmp[4];
|
||||
__m256i bgcolor;
|
||||
if (BlendT::Mode != (int)BlendModes::Opaque)
|
||||
{
|
||||
if (mask0 & (1 << 31)) desttmp[0] = dest[0];
|
||||
if (mask0 & (1 << 30)) desttmp[1] = dest[1];
|
||||
if (mask0 & (1 << 29)) desttmp[2] = dest[2];
|
||||
if (mask0 & (1 << 28)) desttmp[3] = dest[3];
|
||||
|
||||
__m128i bgpacked = _mm_loadu_si128((__m128i*)(desttmp));
|
||||
bgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(bgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(bgpacked, _mm_setzero_si128()));
|
||||
}
|
||||
else
|
||||
bgcolor = _mm256_setzero_si256();
|
||||
|
||||
// Sample fgcolor
|
||||
unsigned int ifgcolor0 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
unsigned int ifgshade0 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
unsigned int ifgcolor1 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
unsigned int ifgshade1 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
unsigned int ifgcolor2 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
unsigned int ifgshade2 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
unsigned int ifgcolor3 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
unsigned int ifgshade3 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
// Setup light
|
||||
int lightpos0 = lightpos >> 8;
|
||||
lightpos += lightstep;
|
||||
int lightpos1 = lightpos >> 8;
|
||||
lightpos += lightstep;
|
||||
int lightpos2 = lightpos >> 8;
|
||||
lightpos += lightstep;
|
||||
int lightpos3 = lightpos >> 8;
|
||||
lightpos += lightstep;
|
||||
__m256i mlight = _mm256_set_epi16(
|
||||
256, lightpos3, lightpos3, lightpos3,
|
||||
256, lightpos2, lightpos2, lightpos2,
|
||||
256, lightpos1, lightpos1, lightpos1,
|
||||
256, lightpos0, lightpos0, lightpos0);
|
||||
|
||||
__m256i shade_fade_lit;
|
||||
if (ShadeModeT::Mode == (int)ShadeMode::Advanced)
|
||||
{
|
||||
__m256i inv_light = _mm256_sub_epi16(_mm256_set_epi16(0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256), mlight);
|
||||
shade_fade_lit = _mm256_mullo_epi16(shade_fade, inv_light);
|
||||
}
|
||||
else
|
||||
{
|
||||
shade_fade_lit = _mm256_setzero_si256();
|
||||
}
|
||||
|
||||
// Shade and blend
|
||||
__m128i fgpacked = _mm_set_epi32(ifgcolor3, ifgcolor2, ifgcolor1, ifgcolor0);
|
||||
__m128i shadepacked = _mm_set_epi32(ifgshade3, ifgshade2, ifgshade1, ifgshade0);
|
||||
__m256i mifgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(fgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(fgpacked, _mm_setzero_si128()));
|
||||
__m256i mifgshade = _mm256_set_m128i(_mm_unpackhi_epi32(shadepacked, shadepacked), _mm_unpacklo_epi32(shadepacked, shadepacked));
|
||||
__m256i fgcolor = mifgcolor;
|
||||
fgcolor = Shade32_AVX2<ShadeModeT>(fgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light);
|
||||
__m256i outcolor = Blend32_AVX2<BlendT>(fgcolor, bgcolor, mifgcolor, mifgshade, srcalpha, destalpha);
|
||||
|
||||
// Store result
|
||||
_mm_storeu_si128((__m128i*)desttmp, _mm_or_si128(_mm256_extracti128_si256(outcolor, 0), _mm_slli_si128(_mm256_extracti128_si256(outcolor, 1), 8)));
|
||||
if (mask0 & (1 << 31)) dest[0] = desttmp[0];
|
||||
if (mask0 & (1 << 30)) dest[1] = desttmp[1];
|
||||
if (mask0 & (1 << 29)) dest[2] = desttmp[2];
|
||||
if (mask0 & (1 << 28)) dest[3] = desttmp[3];
|
||||
|
||||
mask0 <<= 4;
|
||||
dest += 4;
|
||||
}
|
||||
|
||||
blockPosY = _mm_add_ps(blockPosY, gradientY);
|
||||
|
||||
dest += offset_next_line;
|
||||
}
|
||||
|
||||
// mask1 loop:
|
||||
for (int y = 0; y < 4; y++)
|
||||
{
|
||||
__m128 blockPosX = _mm_add_ps(blockPosY, gradientX);
|
||||
__m128 W = _mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
__m128 rcpW = _mm_div_ps(_mm_set1_ps((float)0x01000000), W);
|
||||
__m128i posUV = _mm_cvtps_epi32(_mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(2, 1, 2, 1)), rcpW));
|
||||
|
||||
__m128 vis = _mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(3, 3, 3, 3)), W);
|
||||
__m128i lightpospair = _mm_sub_epi32(
|
||||
_mm_set1_epi32(FRACUNIT),
|
||||
_mm_cvtps_epi32(_mm_mul_ps(
|
||||
_mm_max_ps(_mm_sub_ps(_mm_set1_ps(shade), _mm_min_ps(_mm_set1_ps(24.0f / 32.0f), vis)), _mm_setzero_ps()),
|
||||
_mm_set1_ps((float)FRACUNIT))));
|
||||
lightpospair = _mm_or_si128(_mm_and_si128(lightmask, lightpospair), _mm_andnot_si128(lightmask, fixedlight));
|
||||
|
||||
int32_t posU = _mm_cvtsi128_si32(posUV);
|
||||
int32_t posV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 4));
|
||||
int32_t nextU = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 8));
|
||||
int32_t nextV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 12));
|
||||
int32_t lightpos = _mm_cvtsi128_si32(lightpospair);
|
||||
int32_t lightnext = _mm_cvtsi128_si32(_mm_srli_si128(lightpospair, 8));
|
||||
int32_t stepU = (nextU - posU) >> 3;
|
||||
int32_t stepV = (nextV - posV) >> 3;
|
||||
fixed_t lightstep = (lightnext - lightpos) >> 3;
|
||||
|
||||
for (int x = 0; x < 2; x++)
|
||||
{
|
||||
// Load bgcolor
|
||||
uint32_t desttmp[4];
|
||||
__m256i bgcolor;
|
||||
if (BlendT::Mode != (int)BlendModes::Opaque)
|
||||
{
|
||||
if (mask1 & (1 << 31)) desttmp[0] = dest[0];
|
||||
if (mask1 & (1 << 30)) desttmp[1] = dest[1];
|
||||
if (mask1 & (1 << 29)) desttmp[2] = dest[2];
|
||||
if (mask1 & (1 << 28)) desttmp[3] = dest[3];
|
||||
|
||||
__m128i bgpacked = _mm_loadu_si128((__m128i*)(desttmp));
|
||||
bgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(bgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(bgpacked, _mm_setzero_si128()));
|
||||
}
|
||||
else
|
||||
bgcolor = _mm256_setzero_si256();
|
||||
|
||||
// Sample fgcolor
|
||||
unsigned int ifgcolor0 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
unsigned int ifgshade0 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
unsigned int ifgcolor1 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
unsigned int ifgshade1 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
unsigned int ifgcolor2 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
unsigned int ifgshade2 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
unsigned int ifgcolor3 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
unsigned int ifgshade3 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
// Setup light
|
||||
int lightpos0 = lightpos >> 8;
|
||||
lightpos += lightstep;
|
||||
int lightpos1 = lightpos >> 8;
|
||||
lightpos += lightstep;
|
||||
int lightpos2 = lightpos >> 8;
|
||||
lightpos += lightstep;
|
||||
int lightpos3 = lightpos >> 8;
|
||||
lightpos += lightstep;
|
||||
__m256i mlight = _mm256_set_epi16(
|
||||
256, lightpos3, lightpos3, lightpos3,
|
||||
256, lightpos2, lightpos2, lightpos2,
|
||||
256, lightpos1, lightpos1, lightpos1,
|
||||
256, lightpos0, lightpos0, lightpos0);
|
||||
|
||||
__m256i shade_fade_lit;
|
||||
if (ShadeModeT::Mode == (int)ShadeMode::Advanced)
|
||||
{
|
||||
__m256i inv_light = _mm256_sub_epi16(_mm256_set_epi16(0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256), mlight);
|
||||
shade_fade_lit = _mm256_mullo_epi16(shade_fade, inv_light);
|
||||
}
|
||||
else
|
||||
{
|
||||
shade_fade_lit = _mm256_setzero_si256();
|
||||
}
|
||||
|
||||
// Shade and blend
|
||||
__m128i fgpacked = _mm_set_epi32(ifgcolor3, ifgcolor2, ifgcolor1, ifgcolor0);
|
||||
__m128i shadepacked = _mm_set_epi32(ifgshade3, ifgshade2, ifgshade1, ifgshade0);
|
||||
__m256i mifgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(fgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(fgpacked, _mm_setzero_si128()));
|
||||
__m256i mifgshade = _mm256_set_m128i(_mm_unpackhi_epi32(shadepacked, shadepacked), _mm_unpacklo_epi32(shadepacked, shadepacked));
|
||||
__m256i fgcolor = mifgcolor;
|
||||
fgcolor = Shade32_AVX2<ShadeModeT>(fgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light);
|
||||
__m256i outcolor = Blend32_AVX2<BlendT>(fgcolor, bgcolor, mifgcolor, mifgshade, srcalpha, destalpha);
|
||||
|
||||
// Store result
|
||||
_mm_storeu_si128((__m128i*)desttmp, _mm_or_si128(_mm256_extracti128_si256(outcolor, 0), _mm_slli_si128(_mm256_extracti128_si256(outcolor, 1), 8)));
|
||||
if (mask1 & (1 << 31)) dest[0] = desttmp[0];
|
||||
if (mask1 & (1 << 30)) dest[1] = desttmp[1];
|
||||
if (mask1 & (1 << 29)) dest[2] = desttmp[2];
|
||||
if (mask1 & (1 << 28)) dest[3] = desttmp[3];
|
||||
|
||||
mask1 <<= 4;
|
||||
dest += 4;
|
||||
}
|
||||
|
||||
blockPosY = _mm_add_ps(blockPosY, gradientY);
|
||||
|
||||
dest += offset_next_line;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
|
@ -27,7 +27,7 @@
|
|||
namespace TriScreenDrawerModes
|
||||
{
|
||||
template<typename SamplerT, typename FilterModeT>
|
||||
FORCEINLINE unsigned int VECTORCALL Sample32_SSE2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, uint32_t oneU, uint32_t oneV, uint32_t color, const uint32_t *translation)
|
||||
FORCEINLINE unsigned int VECTORCALL Sample32(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, uint32_t oneU, uint32_t oneV, uint32_t color, const uint32_t *translation)
|
||||
{
|
||||
uint32_t texel;
|
||||
if (SamplerT::Mode == (int)Samplers::Shaded || SamplerT::Mode == (int)Samplers::Stencil || SamplerT::Mode == (int)Samplers::Fill || SamplerT::Mode == (int)Samplers::Fuzz)
|
||||
|
@ -107,7 +107,7 @@ namespace TriScreenDrawerModes
|
|||
}
|
||||
|
||||
template<typename SamplerT>
|
||||
FORCEINLINE unsigned int VECTORCALL SampleShade32_SSE2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, int &fuzzpos)
|
||||
FORCEINLINE unsigned int VECTORCALL SampleShade32(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, int &fuzzpos)
|
||||
{
|
||||
if (SamplerT::Mode == (int)Samplers::Shaded)
|
||||
{
|
||||
|
@ -143,7 +143,7 @@ namespace TriScreenDrawerModes
|
|||
}
|
||||
|
||||
template<typename ShadeModeT>
|
||||
FORCEINLINE __m128i VECTORCALL Shade32_SSE2(__m128i fgcolor, __m128i mlight, unsigned int ifgcolor0, unsigned int ifgcolor1, int desaturate, __m128i inv_desaturate, __m128i shade_fade, __m128i shade_light)
|
||||
FORCEINLINE __m128i VECTORCALL Shade32(__m128i fgcolor, __m128i mlight, unsigned int ifgcolor0, unsigned int ifgcolor1, int desaturate, __m128i inv_desaturate, __m128i shade_fade, __m128i shade_light)
|
||||
{
|
||||
if (ShadeModeT::Mode == (int)ShadeMode::Simple)
|
||||
{
|
||||
|
@ -172,7 +172,7 @@ namespace TriScreenDrawerModes
|
|||
}
|
||||
|
||||
template<typename BlendT>
|
||||
FORCEINLINE __m128i VECTORCALL Blend32_SSE2(__m128i fgcolor, __m128i bgcolor, unsigned int ifgcolor0, unsigned int ifgcolor1, unsigned int ifgshade0, unsigned int ifgshade1, uint32_t srcalpha, uint32_t destalpha)
|
||||
FORCEINLINE __m128i VECTORCALL Blend32(__m128i fgcolor, __m128i bgcolor, unsigned int ifgcolor0, unsigned int ifgcolor1, unsigned int ifgshade0, unsigned int ifgshade1, uint32_t srcalpha, uint32_t destalpha)
|
||||
{
|
||||
if (BlendT::Mode == (int)BlendModes::Opaque)
|
||||
{
|
||||
|
@ -275,7 +275,7 @@ namespace TriScreenDrawerModes
|
|||
}
|
||||
|
||||
template<typename BlendT, typename SamplerT>
|
||||
class TriScreenDrawer32_SSE2
|
||||
class TriScreenDrawer32
|
||||
{
|
||||
public:
|
||||
static void Execute(int x, int y, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args)
|
||||
|
@ -430,13 +430,13 @@ private:
|
|||
|
||||
// Sample fgcolor
|
||||
unsigned int ifgcolor[2], ifgshade[2];
|
||||
ifgcolor[0] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[0] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
ifgcolor[0] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[0] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
ifgcolor[1] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[1] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
ifgcolor[1] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[1] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
|
@ -460,8 +460,8 @@ private:
|
|||
|
||||
// Shade and blend
|
||||
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
|
||||
fgcolor = Shade32_SSE2<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light);
|
||||
__m128i outcolor = Blend32_SSE2<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
|
||||
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light);
|
||||
__m128i outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
|
||||
|
||||
// Store result
|
||||
_mm_storel_epi64((__m128i*)(dest + ix * 2), outcolor);
|
||||
|
@ -517,13 +517,13 @@ private:
|
|||
|
||||
// Sample fgcolor
|
||||
unsigned int ifgcolor[2], ifgshade[2];
|
||||
ifgcolor[0] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[0] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
ifgcolor[0] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[0] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
ifgcolor[1] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[1] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
ifgcolor[1] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[1] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
|
@ -547,8 +547,8 @@ private:
|
|||
|
||||
// Shade and blend
|
||||
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
|
||||
fgcolor = Shade32_SSE2<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light);
|
||||
__m128i outcolor = Blend32_SSE2<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
|
||||
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light);
|
||||
__m128i outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
|
||||
|
||||
// Store result
|
||||
_mm_storel_epi64((__m128i*)desttmp, outcolor);
|
||||
|
@ -606,13 +606,13 @@ private:
|
|||
|
||||
// Sample fgcolor
|
||||
unsigned int ifgcolor[2], ifgshade[2];
|
||||
ifgcolor[0] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[0] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
ifgcolor[0] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[0] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
ifgcolor[1] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[1] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
ifgcolor[1] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[1] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
posV += stepV;
|
||||
|
||||
|
@ -636,8 +636,8 @@ private:
|
|||
|
||||
// Shade and blend
|
||||
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
|
||||
fgcolor = Shade32_SSE2<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light);
|
||||
__m128i outcolor = Blend32_SSE2<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
|
||||
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light);
|
||||
__m128i outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
|
||||
|
||||
// Store result
|
||||
_mm_storel_epi64((__m128i*)desttmp, outcolor);
|
||||
|
@ -658,7 +658,7 @@ private:
|
|||
};
|
||||
|
||||
template<typename BlendT, typename SamplerT>
|
||||
class RectScreenDrawer32_SSE2
|
||||
class RectScreenDrawer32
|
||||
{
|
||||
public:
|
||||
static void Execute(const void *destOrg, int destWidth, int destHeight, int destPitch, const RectDrawArgs *args, WorkerThreadData *thread)
|
||||
|
@ -780,18 +780,18 @@ private:
|
|||
|
||||
// Sample fgcolor
|
||||
unsigned int ifgcolor[2], ifgshade[2];
|
||||
ifgcolor[0] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[0] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
ifgcolor[0] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[0] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
|
||||
ifgcolor[1] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[1] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
ifgcolor[1] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[1] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
posU += stepU;
|
||||
|
||||
// Shade and blend
|
||||
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
|
||||
fgcolor = Shade32_SSE2<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light);
|
||||
__m128i outcolor = Blend32_SSE2<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
|
||||
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light);
|
||||
__m128i outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
|
||||
|
||||
// Store result
|
||||
_mm_storel_epi64((__m128i*)dest, outcolor);
|
||||
|
@ -809,16 +809,16 @@ private:
|
|||
|
||||
// Sample fgcolor
|
||||
unsigned int ifgcolor[2], ifgshade[2];
|
||||
ifgcolor[0] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[0] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
ifgcolor[0] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
|
||||
ifgshade[0] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
|
||||
ifgcolor[1] = 0;
|
||||
ifgshade[1] = 0;
|
||||
posU += stepU;
|
||||
|
||||
// Shade and blend
|
||||
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
|
||||
fgcolor = Shade32_SSE2<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light);
|
||||
__m128i outcolor = Blend32_SSE2<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
|
||||
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light);
|
||||
__m128i outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
|
||||
|
||||
// Store result
|
||||
*dest = _mm_cvtsi128_si32(outcolor);
|
||||
|
|
|
@ -152,8 +152,14 @@ ShadedTriVertex PolyTriangleDrawer::shade_vertex(const TriMatrix &objectToClip,
|
|||
return sv;
|
||||
}
|
||||
|
||||
void PolyTriangleDrawer::clip_to_viewport(TriVertex *clippedvert, int numclipvert)
|
||||
void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread)
|
||||
{
|
||||
// Cull, clip and generate additional vertices as needed
|
||||
TriVertex clippedvert[max_additional_vertices];
|
||||
int numclipvert = clipedge(vert, clippedvert);
|
||||
|
||||
#ifdef NO_SSE
|
||||
// Map to 2D viewport:
|
||||
for (int j = 0; j < numclipvert; j++)
|
||||
{
|
||||
auto &v = clippedvert[j];
|
||||
|
@ -168,11 +174,8 @@ void PolyTriangleDrawer::clip_to_viewport(TriVertex *clippedvert, int numclipver
|
|||
v.x = viewport_x + viewport_width * (1.0f + v.x) * 0.5f;
|
||||
v.y = viewport_y + viewport_height * (1.0f - v.y) * 0.5f;
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef NO_SSE
|
||||
void PolyTriangleDrawer::clip_to_viewport_sse2(TriVertex *clippedvert, int numclipvert)
|
||||
{
|
||||
#else
|
||||
// Map to 2D viewport:
|
||||
__m128 mviewport_x = _mm_set1_ps((float)viewport_x);
|
||||
__m128 mviewport_y = _mm_set1_ps((float)viewport_y);
|
||||
__m128 mviewport_halfwidth = _mm_set1_ps(viewport_width * 0.5f);
|
||||
|
@ -203,21 +206,8 @@ void PolyTriangleDrawer::clip_to_viewport_sse2(TriVertex *clippedvert, int numcl
|
|||
_mm_storeu_ps(&clippedvert[j + 2].x, vz);
|
||||
_mm_storeu_ps(&clippedvert[j + 3].x, vw);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread)
|
||||
{
|
||||
// Cull, clip and generate additional vertices as needed
|
||||
TriVertex clippedvert[max_additional_vertices];
|
||||
int numclipvert = CPU.bSSE2 ? clipedge_sse2(vert, clippedvert) : clipedge(vert, clippedvert);
|
||||
|
||||
// Map to 2D viewport:
|
||||
if (CPU.bSSE2)
|
||||
clip_to_viewport_sse2(clippedvert, numclipvert);
|
||||
else
|
||||
clip_to_viewport(clippedvert, numclipvert);
|
||||
|
||||
// Keep varyings in -128 to 128 range if possible
|
||||
if (numclipvert > 0)
|
||||
{
|
||||
|
@ -266,6 +256,7 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe
|
|||
|
||||
// halfspace clip distances
|
||||
static const int numclipdistances = 7;
|
||||
#ifdef NO_SSE
|
||||
float clipdistance[numclipdistances * 3];
|
||||
bool needsclipping = false;
|
||||
float *clipd = clipdistance;
|
||||
|
@ -292,6 +283,43 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe
|
|||
}
|
||||
return 3;
|
||||
}
|
||||
#else
|
||||
__m128 mx = _mm_loadu_ps(&verts[0].x);
|
||||
__m128 my = _mm_loadu_ps(&verts[1].x);
|
||||
__m128 mz = _mm_loadu_ps(&verts[2].x);
|
||||
__m128 mw = _mm_setzero_ps();
|
||||
_MM_TRANSPOSE4_PS(mx, my, mz, mw);
|
||||
__m128 clipd0 = _mm_add_ps(mx, mw);
|
||||
__m128 clipd1 = _mm_sub_ps(mw, mx);
|
||||
__m128 clipd2 = _mm_add_ps(my, mw);
|
||||
__m128 clipd3 = _mm_sub_ps(mw, my);
|
||||
__m128 clipd4 = _mm_add_ps(mz, mw);
|
||||
__m128 clipd5 = _mm_sub_ps(mw, mz);
|
||||
__m128 clipd6 = _mm_setr_ps(verts[0].clipDistance0, verts[1].clipDistance0, verts[2].clipDistance0, 0.0f);
|
||||
__m128 mneedsclipping = _mm_cmplt_ps(clipd0, _mm_setzero_ps());
|
||||
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd1, _mm_setzero_ps()));
|
||||
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd2, _mm_setzero_ps()));
|
||||
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd3, _mm_setzero_ps()));
|
||||
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd4, _mm_setzero_ps()));
|
||||
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd5, _mm_setzero_ps()));
|
||||
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd6, _mm_setzero_ps()));
|
||||
if (_mm_movemask_ps(mneedsclipping) == 0)
|
||||
{
|
||||
for (int i = 0; i < 3; i++)
|
||||
{
|
||||
memcpy(clippedvert + i, verts + i, sizeof(TriVertex));
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
float clipdistance[numclipdistances * 4];
|
||||
_mm_storeu_ps(clipdistance, clipd0);
|
||||
_mm_storeu_ps(clipdistance + 4, clipd1);
|
||||
_mm_storeu_ps(clipdistance + 8, clipd2);
|
||||
_mm_storeu_ps(clipdistance + 12, clipd3);
|
||||
_mm_storeu_ps(clipdistance + 16, clipd4);
|
||||
_mm_storeu_ps(clipdistance + 20, clipd5);
|
||||
_mm_storeu_ps(clipdistance + 24, clipd6);
|
||||
#endif
|
||||
|
||||
// use barycentric weights while clipping vertices
|
||||
float weights[max_additional_vertices * 3 * 2];
|
||||
|
@ -314,6 +342,7 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe
|
|||
for (int i = 0; i < inputverts; i++)
|
||||
{
|
||||
int j = (i + 1) % inputverts;
|
||||
#ifdef NO_SSE
|
||||
float clipdistance1 =
|
||||
clipdistance[0 * numclipdistances + p] * input[i * 3 + 0] +
|
||||
clipdistance[1 * numclipdistances + p] * input[i * 3 + 1] +
|
||||
|
@ -323,6 +352,17 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe
|
|||
clipdistance[0 * numclipdistances + p] * input[j * 3 + 0] +
|
||||
clipdistance[1 * numclipdistances + p] * input[j * 3 + 1] +
|
||||
clipdistance[2 * numclipdistances + p] * input[j * 3 + 2];
|
||||
#else
|
||||
float clipdistance1 =
|
||||
clipdistance[0 + p * 4] * input[i * 3 + 0] +
|
||||
clipdistance[1 + p * 4] * input[i * 3 + 1] +
|
||||
clipdistance[2 + p * 4] * input[i * 3 + 2];
|
||||
|
||||
float clipdistance2 =
|
||||
clipdistance[0 + p * 4] * input[j * 3 + 0] +
|
||||
clipdistance[1 + p * 4] * input[j * 3 + 1] +
|
||||
clipdistance[2 + p * 4] * input[j * 3 + 2];
|
||||
#endif
|
||||
|
||||
// Clip halfspace
|
||||
if ((clipdistance1 >= 0.0f || clipdistance2 >= 0.0f) && outputverts + 1 < max_additional_vertices)
|
||||
|
@ -369,129 +409,6 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe
|
|||
return inputverts;
|
||||
}
|
||||
|
||||
#ifndef NO_SSE
|
||||
int PolyTriangleDrawer::clipedge_sse2(const ShadedTriVertex *verts, TriVertex *clippedvert)
|
||||
{
|
||||
// Clip and cull so that the following is true for all vertices:
|
||||
// -v.w <= v.x <= v.w
|
||||
// -v.w <= v.y <= v.w
|
||||
// -v.w <= v.z <= v.w
|
||||
|
||||
// halfspace clip distances
|
||||
static const int numclipdistances = 7;
|
||||
__m128 mx = _mm_loadu_ps(&verts[0].x);
|
||||
__m128 my = _mm_loadu_ps(&verts[1].x);
|
||||
__m128 mz = _mm_loadu_ps(&verts[2].x);
|
||||
__m128 mw = _mm_setzero_ps();
|
||||
_MM_TRANSPOSE4_PS(mx, my, mz, mw);
|
||||
__m128 clipd0 = _mm_add_ps(mx, mw);
|
||||
__m128 clipd1 = _mm_sub_ps(mw, mx);
|
||||
__m128 clipd2 = _mm_add_ps(my, mw);
|
||||
__m128 clipd3 = _mm_sub_ps(mw, my);
|
||||
__m128 clipd4 = _mm_add_ps(mz, mw);
|
||||
__m128 clipd5 = _mm_sub_ps(mw, mz);
|
||||
__m128 clipd6 = _mm_setr_ps(verts[0].clipDistance0, verts[1].clipDistance0, verts[2].clipDistance0, 0.0f);
|
||||
__m128 mneedsclipping = _mm_cmplt_ps(clipd0, _mm_setzero_ps());
|
||||
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd1, _mm_setzero_ps()));
|
||||
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd2, _mm_setzero_ps()));
|
||||
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd3, _mm_setzero_ps()));
|
||||
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd4, _mm_setzero_ps()));
|
||||
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd5, _mm_setzero_ps()));
|
||||
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd6, _mm_setzero_ps()));
|
||||
if (_mm_movemask_ps(mneedsclipping) == 0)
|
||||
{
|
||||
for (int i = 0; i < 3; i++)
|
||||
{
|
||||
memcpy(clippedvert + i, verts + i, sizeof(TriVertex));
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
float clipdistance[numclipdistances * 4];
|
||||
_mm_storeu_ps(clipdistance, clipd0);
|
||||
_mm_storeu_ps(clipdistance + 4, clipd1);
|
||||
_mm_storeu_ps(clipdistance + 8, clipd2);
|
||||
_mm_storeu_ps(clipdistance + 12, clipd3);
|
||||
_mm_storeu_ps(clipdistance + 16, clipd4);
|
||||
_mm_storeu_ps(clipdistance + 20, clipd5);
|
||||
_mm_storeu_ps(clipdistance + 24, clipd6);
|
||||
|
||||
// use barycentric weights while clipping vertices
|
||||
float weights[max_additional_vertices * 3 * 2];
|
||||
for (int i = 0; i < 3; i++)
|
||||
{
|
||||
weights[i * 3 + 0] = 0.0f;
|
||||
weights[i * 3 + 1] = 0.0f;
|
||||
weights[i * 3 + 2] = 0.0f;
|
||||
weights[i * 3 + i] = 1.0f;
|
||||
}
|
||||
|
||||
// Clip against each halfspace
|
||||
float *input = weights;
|
||||
float *output = weights + max_additional_vertices * 3;
|
||||
int inputverts = 3;
|
||||
for (int p = 0; p < numclipdistances; p++)
|
||||
{
|
||||
// Clip each edge
|
||||
int outputverts = 0;
|
||||
for (int i = 0; i < inputverts; i++)
|
||||
{
|
||||
int j = (i + 1) % inputverts;
|
||||
float clipdistance1 =
|
||||
clipdistance[0 + p * 4] * input[i * 3 + 0] +
|
||||
clipdistance[1 + p * 4] * input[i * 3 + 1] +
|
||||
clipdistance[2 + p * 4] * input[i * 3 + 2];
|
||||
|
||||
float clipdistance2 =
|
||||
clipdistance[0 + p * 4] * input[j * 3 + 0] +
|
||||
clipdistance[1 + p * 4] * input[j * 3 + 1] +
|
||||
clipdistance[2 + p * 4] * input[j * 3 + 2];
|
||||
|
||||
// Clip halfspace
|
||||
if ((clipdistance1 >= 0.0f || clipdistance2 >= 0.0f) && outputverts + 1 < max_additional_vertices)
|
||||
{
|
||||
float t1 = (clipdistance1 < 0.0f) ? MAX(-clipdistance1 / (clipdistance2 - clipdistance1), 0.0f) : 0.0f;
|
||||
float t2 = (clipdistance2 < 0.0f) ? MIN(1.0f + clipdistance2 / (clipdistance1 - clipdistance2), 1.0f) : 1.0f;
|
||||
|
||||
// add t1 vertex
|
||||
for (int k = 0; k < 3; k++)
|
||||
output[outputverts * 3 + k] = input[i * 3 + k] * (1.0f - t1) + input[j * 3 + k] * t1;
|
||||
outputverts++;
|
||||
|
||||
if (t2 != 1.0f && t2 > t1)
|
||||
{
|
||||
// add t2 vertex
|
||||
for (int k = 0; k < 3; k++)
|
||||
output[outputverts * 3 + k] = input[i * 3 + k] * (1.0f - t2) + input[j * 3 + k] * t2;
|
||||
outputverts++;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::swap(input, output);
|
||||
inputverts = outputverts;
|
||||
if (inputverts == 0)
|
||||
break;
|
||||
}
|
||||
|
||||
// Convert barycentric weights to actual vertices
|
||||
for (int i = 0; i < inputverts; i++)
|
||||
{
|
||||
auto &v = clippedvert[i];
|
||||
memset(&v, 0, sizeof(TriVertex));
|
||||
for (int w = 0; w < 3; w++)
|
||||
{
|
||||
float weight = input[i * 3 + w];
|
||||
v.x += verts[w].x * weight;
|
||||
v.y += verts[w].y * weight;
|
||||
v.z += verts[w].z * weight;
|
||||
v.w += verts[w].w * weight;
|
||||
v.u += verts[w].u * weight;
|
||||
v.v += verts[w].v * weight;
|
||||
}
|
||||
}
|
||||
return inputverts;
|
||||
}
|
||||
#endif
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
DrawPolyTrianglesCommand::DrawPolyTrianglesCommand(const PolyDrawArgs &args, bool mirror)
|
||||
|
|
|
@ -47,12 +47,8 @@ private:
|
|||
static ShadedTriVertex shade_vertex(const TriMatrix &objectToClip, const float *clipPlane, const TriVertex &v);
|
||||
static void draw_arrays(const PolyDrawArgs &args, WorkerThreadData *thread);
|
||||
static void draw_shaded_triangle(const ShadedTriVertex *vertices, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread);
|
||||
static void clip_to_viewport(TriVertex *clippedvert, int numclipvert);
|
||||
|
||||
static int clipedge(const ShadedTriVertex *verts, TriVertex *clippedvert);
|
||||
#ifndef NO_SSE
|
||||
static void clip_to_viewport_sse2(TriVertex *clippedvert, int numclipvert);
|
||||
static int clipedge_sse2(const ShadedTriVertex *verts, TriVertex *clippedvert);
|
||||
#endif
|
||||
|
||||
static int viewport_x, viewport_y, viewport_width, viewport_height, dest_pitch, dest_width, dest_height;
|
||||
static bool dest_bgra;
|
||||
|
|
|
@ -36,20 +36,14 @@
|
|||
#include "poly_triangle.h"
|
||||
#include "swrenderer/drawers/r_draw_rgba.h"
|
||||
#include "screen_triangle.h"
|
||||
#include "poly_drawer32.h"
|
||||
#include "poly_drawer8.h"
|
||||
#ifndef NO_SSE
|
||||
#include "poly_drawer32_sse2.h"
|
||||
#else
|
||||
#include "poly_drawer32.h"
|
||||
#endif
|
||||
#include "poly_drawer8.h"
|
||||
#include "x86.h"
|
||||
|
||||
namespace
|
||||
{
|
||||
class SSE2CPU { public: static const int HasSSE2 = 1; };
|
||||
class GenericCPU { public: static const int HasSSE2 = 0; };
|
||||
}
|
||||
|
||||
template<typename CPUType>
|
||||
class TriangleBlock
|
||||
{
|
||||
public:
|
||||
|
@ -123,17 +117,9 @@ private:
|
|||
void ClipTest();
|
||||
void StencilWrite();
|
||||
void SubsectorWrite();
|
||||
|
||||
#ifndef NO_SSE
|
||||
void CoverageTestSSE2();
|
||||
void StencilEqualTestSSE2();
|
||||
void SubsectorTestSSE2();
|
||||
void SubsectorWriteSSE2();
|
||||
#endif
|
||||
};
|
||||
|
||||
template<typename CPUType>
|
||||
TriangleBlock<CPUType>::TriangleBlock(const TriDrawTriangleArgs *args)
|
||||
TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args)
|
||||
{
|
||||
const TriVertex &v1 = *args->v1;
|
||||
const TriVertex &v2 = *args->v2;
|
||||
|
@ -162,32 +148,19 @@ TriangleBlock<CPUType>::TriangleBlock(const TriDrawTriangleArgs *args)
|
|||
const int X2 = (int)round(16.0f * v2.x);
|
||||
const int X3 = (int)round(16.0f * v3.x);
|
||||
#else
|
||||
int Y1, Y2, Y3, X1, X2, X3;
|
||||
if (CPUType::HasSSE2 == 1)
|
||||
{
|
||||
int tempround[4 * 3];
|
||||
__m128 m16 = _mm_set1_ps(16.0f);
|
||||
__m128 mhalf = _mm_set1_ps(65536.5f);
|
||||
__m128i m65536 = _mm_set1_epi32(65536);
|
||||
_mm_storeu_si128((__m128i*)tempround, _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v1), m16), mhalf)), m65536));
|
||||
_mm_storeu_si128((__m128i*)(tempround + 4), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v2), m16), mhalf)), m65536));
|
||||
_mm_storeu_si128((__m128i*)(tempround + 8), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v3), m16), mhalf)), m65536));
|
||||
X1 = tempround[0];
|
||||
X2 = tempround[4];
|
||||
X3 = tempround[8];
|
||||
Y1 = tempround[1];
|
||||
Y2 = tempround[5];
|
||||
Y3 = tempround[9];
|
||||
}
|
||||
else
|
||||
{
|
||||
Y1 = (int)round(16.0f * v1.y);
|
||||
Y2 = (int)round(16.0f * v2.y);
|
||||
Y3 = (int)round(16.0f * v3.y);
|
||||
X1 = (int)round(16.0f * v1.x);
|
||||
X2 = (int)round(16.0f * v2.x);
|
||||
X3 = (int)round(16.0f * v3.x);
|
||||
}
|
||||
int tempround[4 * 3];
|
||||
__m128 m16 = _mm_set1_ps(16.0f);
|
||||
__m128 mhalf = _mm_set1_ps(65536.5f);
|
||||
__m128i m65536 = _mm_set1_epi32(65536);
|
||||
_mm_storeu_si128((__m128i*)tempround, _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v1), m16), mhalf)), m65536));
|
||||
_mm_storeu_si128((__m128i*)(tempround + 4), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v2), m16), mhalf)), m65536));
|
||||
_mm_storeu_si128((__m128i*)(tempround + 8), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v3), m16), mhalf)), m65536));
|
||||
const int X1 = tempround[0];
|
||||
const int X2 = tempround[4];
|
||||
const int X3 = tempround[8];
|
||||
const int Y1 = tempround[1];
|
||||
const int Y2 = tempround[5];
|
||||
const int Y3 = tempround[9];
|
||||
#endif
|
||||
|
||||
// Deltas
|
||||
|
@ -233,32 +206,28 @@ TriangleBlock<CPUType>::TriangleBlock(const TriDrawTriangleArgs *args)
|
|||
if (DY31 < 0 || (DY31 == 0 && DX31 > 0)) C3++;
|
||||
|
||||
#ifndef NO_SSE
|
||||
if (CPUType::HasSSE2 == 1)
|
||||
{
|
||||
mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3);
|
||||
mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3);
|
||||
mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3);
|
||||
mFDY12x4 = _mm_set1_epi32(FDY12 * 4);
|
||||
mFDY23x4 = _mm_set1_epi32(FDY23 * 4);
|
||||
mFDY31x4 = _mm_set1_epi32(FDY31 * 4);
|
||||
mFDX12 = _mm_set1_epi32(FDX12);
|
||||
mFDX23 = _mm_set1_epi32(FDX23);
|
||||
mFDX31 = _mm_set1_epi32(FDX31);
|
||||
mC1 = _mm_set1_epi32(C1);
|
||||
mC2 = _mm_set1_epi32(C2);
|
||||
mC3 = _mm_set1_epi32(C3);
|
||||
mDX12 = _mm_set1_epi32(DX12);
|
||||
mDY12 = _mm_set1_epi32(DY12);
|
||||
mDX23 = _mm_set1_epi32(DX23);
|
||||
mDY23 = _mm_set1_epi32(DY23);
|
||||
mDX31 = _mm_set1_epi32(DX31);
|
||||
mDY31 = _mm_set1_epi32(DY31);
|
||||
}
|
||||
mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3);
|
||||
mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3);
|
||||
mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3);
|
||||
mFDY12x4 = _mm_set1_epi32(FDY12 * 4);
|
||||
mFDY23x4 = _mm_set1_epi32(FDY23 * 4);
|
||||
mFDY31x4 = _mm_set1_epi32(FDY31 * 4);
|
||||
mFDX12 = _mm_set1_epi32(FDX12);
|
||||
mFDX23 = _mm_set1_epi32(FDX23);
|
||||
mFDX31 = _mm_set1_epi32(FDX31);
|
||||
mC1 = _mm_set1_epi32(C1);
|
||||
mC2 = _mm_set1_epi32(C2);
|
||||
mC3 = _mm_set1_epi32(C3);
|
||||
mDX12 = _mm_set1_epi32(DX12);
|
||||
mDY12 = _mm_set1_epi32(DY12);
|
||||
mDX23 = _mm_set1_epi32(DX23);
|
||||
mDY23 = _mm_set1_epi32(DY23);
|
||||
mDX31 = _mm_set1_epi32(DX31);
|
||||
mDY31 = _mm_set1_epi32(DY31);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename CPUType>
|
||||
void TriangleBlock<CPUType>::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thread)
|
||||
void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thread)
|
||||
{
|
||||
// First block line for this thread
|
||||
int core = thread->core;
|
||||
|
@ -270,18 +239,9 @@ void TriangleBlock<CPUType>::Loop(const TriDrawTriangleArgs *args, WorkerThreadD
|
|||
bool writeColor = args->uniforms->WriteColor();
|
||||
bool writeStencil = args->uniforms->WriteStencil();
|
||||
bool writeSubsector = args->uniforms->WriteSubsector();
|
||||
int bmode = (int)args->uniforms->BlendMode();
|
||||
|
||||
// Find the drawer function for the given blend mode
|
||||
#ifndef NO_SSE
|
||||
void(*drawFunc)(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *);
|
||||
if (CPUType::HasSSE2 == 1)
|
||||
drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32_SSE2[bmode] : ScreenTriangle::TriDrawers8[bmode];
|
||||
else
|
||||
drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32[bmode] : ScreenTriangle::TriDrawers8[bmode];
|
||||
#else
|
||||
int bmode = (int)args->uniforms->BlendMode();
|
||||
auto drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32[bmode] : ScreenTriangle::TriDrawers8[bmode];
|
||||
#endif
|
||||
|
||||
// Loop through blocks
|
||||
for (int y = start_miny; y < maxy; y += q * num_cores)
|
||||
|
@ -291,11 +251,7 @@ void TriangleBlock<CPUType>::Loop(const TriDrawTriangleArgs *args, WorkerThreadD
|
|||
X = x;
|
||||
Y = y;
|
||||
|
||||
if (CPUType::HasSSE2 == 1)
|
||||
CoverageTestSSE2();
|
||||
else
|
||||
CoverageTest();
|
||||
|
||||
CoverageTest();
|
||||
if (Mask0 == 0 && Mask1 == 0)
|
||||
continue;
|
||||
|
||||
|
@ -306,11 +262,7 @@ void TriangleBlock<CPUType>::Loop(const TriDrawTriangleArgs *args, WorkerThreadD
|
|||
// To do: make the stencil test use its own flag for comparison mode instead of abusing the subsector test..
|
||||
if (!subsectorTest)
|
||||
{
|
||||
if (CPUType::HasSSE2 == 1)
|
||||
StencilEqualTestSSE2();
|
||||
else
|
||||
StencilEqualTest();
|
||||
|
||||
StencilEqualTest();
|
||||
if (Mask0 == 0 && Mask1 == 0)
|
||||
continue;
|
||||
}
|
||||
|
@ -320,11 +272,7 @@ void TriangleBlock<CPUType>::Loop(const TriDrawTriangleArgs *args, WorkerThreadD
|
|||
if (Mask0 == 0 && Mask1 == 0)
|
||||
continue;
|
||||
|
||||
if (CPUType::HasSSE2 == 1)
|
||||
SubsectorTestSSE2();
|
||||
else
|
||||
SubsectorTest();
|
||||
|
||||
SubsectorTest();
|
||||
if (Mask0 == 0 && Mask1 == 0)
|
||||
continue;
|
||||
}
|
||||
|
@ -334,18 +282,14 @@ void TriangleBlock<CPUType>::Loop(const TriDrawTriangleArgs *args, WorkerThreadD
|
|||
if (writeStencil)
|
||||
StencilWrite();
|
||||
if (writeSubsector)
|
||||
{
|
||||
if (CPUType::HasSSE2 == 1)
|
||||
SubsectorWriteSSE2();
|
||||
else
|
||||
SubsectorWrite();
|
||||
}
|
||||
SubsectorWrite();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename CPUType>
|
||||
void TriangleBlock<CPUType>::SubsectorTest()
|
||||
#ifdef NO_SSE
|
||||
|
||||
void TriangleBlock::SubsectorTest()
|
||||
{
|
||||
int block = (X >> 3) + (Y >> 3) * subsectorPitch;
|
||||
uint32_t *subsector = subsectorGBuffer + block * 64;
|
||||
|
@ -371,10 +315,9 @@ void TriangleBlock<CPUType>::SubsectorTest()
|
|||
Mask1 = Mask1 & mask1;
|
||||
}
|
||||
|
||||
#ifndef NO_SSE
|
||||
#else
|
||||
|
||||
template<typename CPUType>
|
||||
void TriangleBlock<CPUType>::SubsectorTestSSE2()
|
||||
void TriangleBlock::SubsectorTest()
|
||||
{
|
||||
int block = (X >> 3) + (Y >> 3) * subsectorPitch;
|
||||
uint32_t *subsector = subsectorGBuffer + block * 64;
|
||||
|
@ -402,8 +345,7 @@ void TriangleBlock<CPUType>::SubsectorTestSSE2()
|
|||
|
||||
#endif
|
||||
|
||||
template<typename CPUType>
|
||||
void TriangleBlock<CPUType>::ClipTest()
|
||||
void TriangleBlock::ClipTest()
|
||||
{
|
||||
static const uint32_t clipxmask[8] =
|
||||
{
|
||||
|
@ -437,8 +379,9 @@ void TriangleBlock<CPUType>::ClipTest()
|
|||
Mask1 = Mask1 & xmask & ymask1;
|
||||
}
|
||||
|
||||
template<typename CPUType>
|
||||
void TriangleBlock<CPUType>::StencilEqualTest()
|
||||
#ifdef NO_SSE
|
||||
|
||||
void TriangleBlock::StencilEqualTest()
|
||||
{
|
||||
// Stencil test the whole block, if possible
|
||||
int block = (X >> 3) + (Y >> 3) * stencilPitch;
|
||||
|
@ -481,10 +424,9 @@ void TriangleBlock<CPUType>::StencilEqualTest()
|
|||
}
|
||||
}
|
||||
|
||||
#ifndef NO_SSE
|
||||
#else
|
||||
|
||||
template<typename CPUType>
|
||||
void TriangleBlock<CPUType>::StencilEqualTestSSE2()
|
||||
void TriangleBlock::StencilEqualTest()
|
||||
{
|
||||
// Stencil test the whole block, if possible
|
||||
int block = (X >> 3) + (Y >> 3) * stencilPitch;
|
||||
|
@ -550,8 +492,7 @@ void TriangleBlock<CPUType>::StencilEqualTestSSE2()
|
|||
|
||||
#endif
|
||||
|
||||
template<typename CPUType>
|
||||
void TriangleBlock<CPUType>::StencilGreaterEqualTest()
|
||||
void TriangleBlock::StencilGreaterEqualTest()
|
||||
{
|
||||
// Stencil test the whole block, if possible
|
||||
int block = (X >> 3) + (Y >> 3) * stencilPitch;
|
||||
|
@ -594,8 +535,9 @@ void TriangleBlock<CPUType>::StencilGreaterEqualTest()
|
|||
}
|
||||
}
|
||||
|
||||
template<typename CPUType>
|
||||
void TriangleBlock<CPUType>::CoverageTest()
|
||||
#ifdef NO_SSE
|
||||
|
||||
void TriangleBlock::CoverageTest()
|
||||
{
|
||||
// Corners of block
|
||||
int x0 = X << 4;
|
||||
|
@ -692,10 +634,9 @@ void TriangleBlock<CPUType>::CoverageTest()
|
|||
}
|
||||
}
|
||||
|
||||
#ifndef NO_SSE
|
||||
#else
|
||||
|
||||
template<typename CPUType>
|
||||
void TriangleBlock<CPUType>::CoverageTestSSE2()
|
||||
void TriangleBlock::CoverageTest()
|
||||
{
|
||||
// Corners of block
|
||||
int x0 = X << 4;
|
||||
|
@ -805,8 +746,7 @@ void TriangleBlock<CPUType>::CoverageTestSSE2()
|
|||
|
||||
#endif
|
||||
|
||||
template<typename CPUType>
|
||||
void TriangleBlock<CPUType>::StencilWrite()
|
||||
void TriangleBlock::StencilWrite()
|
||||
{
|
||||
int block = (X >> 3) + (Y >> 3) * stencilPitch;
|
||||
uint8_t *stencilBlock = &stencilValues[block * 64];
|
||||
|
@ -856,8 +796,9 @@ void TriangleBlock<CPUType>::StencilWrite()
|
|||
}
|
||||
}
|
||||
|
||||
template<typename CPUType>
|
||||
void TriangleBlock<CPUType>::SubsectorWrite()
|
||||
#ifdef NO_SSE
|
||||
|
||||
void TriangleBlock::SubsectorWrite()
|
||||
{
|
||||
int block = (X >> 3) + (Y >> 3) * subsectorPitch;
|
||||
uint32_t *subsector = subsectorGBuffer + block * 64;
|
||||
|
@ -890,10 +831,9 @@ void TriangleBlock<CPUType>::SubsectorWrite()
|
|||
}
|
||||
}
|
||||
|
||||
#ifndef NO_SSE
|
||||
#else
|
||||
|
||||
template<typename CPUType>
|
||||
void TriangleBlock<CPUType>::SubsectorWriteSSE2()
|
||||
void TriangleBlock::SubsectorWrite()
|
||||
{
|
||||
int block = (X >> 3) + (Y >> 3) * subsectorPitch;
|
||||
uint32_t *subsector = subsectorGBuffer + block * 64;
|
||||
|
@ -950,21 +890,8 @@ void TriangleBlock<CPUType>::SubsectorWriteSSE2()
|
|||
|
||||
void ScreenTriangle::Draw(const TriDrawTriangleArgs *args, WorkerThreadData *thread)
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
TriangleBlock<GenericCPU> block(args);
|
||||
TriangleBlock block(args);
|
||||
block.Loop(args, thread);
|
||||
#else
|
||||
if (CPU.bSSE2)
|
||||
{
|
||||
TriangleBlock<SSE2CPU> block(args);
|
||||
block.Loop(args, thread);
|
||||
}
|
||||
else
|
||||
{
|
||||
TriangleBlock<GenericCPU> block(args);
|
||||
block.Loop(args, thread);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void(*ScreenTriangle::TriDrawers8[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) =
|
||||
|
@ -1021,37 +948,6 @@ void(*ScreenTriangle::TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDra
|
|||
&TriScreenDrawer32<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::FuzzSampler>::Execute // Fuzz
|
||||
};
|
||||
|
||||
#ifndef NO_SSE
|
||||
|
||||
void(*ScreenTriangle::TriDrawers32_SSE2[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) =
|
||||
{
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureOpaque
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::MaskedBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureMasked
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureAdd
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::SubClampBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureSub
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::RevSubClampBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureRevSub
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::AddSrcColorBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureAddSrcColor
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedOpaque
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::MaskedBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedMasked
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedAdd
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::SubClampBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedSub
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::RevSubClampBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedRevSub
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::AddSrcColorBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedAddSrcColor
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::ShadedSampler>::Execute, // Shaded
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampShadedBlend, TriScreenDrawerModes::ShadedSampler>::Execute, // AddShaded
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::StencilSampler>::Execute, // Stencil
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampShadedBlend, TriScreenDrawerModes::StencilSampler>::Execute, // AddStencil
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillOpaque
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillAdd
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::SubClampBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillSub
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::RevSubClampBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillRevSub
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::AddSrcColorBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillAddSrcColor
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::SkycapSampler>::Execute, // Skycap
|
||||
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::FuzzSampler>::Execute // Fuzz
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
void(*ScreenTriangle::RectDrawers8[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) =
|
||||
{
|
||||
&RectScreenDrawer8<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureOpaque
|
||||
|
@ -1106,35 +1002,4 @@ void(*ScreenTriangle::RectDrawers32[])(const void *, int, int, int, const RectDr
|
|||
&RectScreenDrawer32<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::FuzzSampler>::Execute // Fuzz
|
||||
};
|
||||
|
||||
#ifndef NO_SSE
|
||||
|
||||
void(*ScreenTriangle::RectDrawers32_SSE2[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) =
|
||||
{
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureOpaque
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::MaskedBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureMasked
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureAdd
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::SubClampBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureSub
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::RevSubClampBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureRevSub
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::AddSrcColorBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureAddSrcColor
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedOpaque
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::MaskedBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedMasked
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedAdd
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::SubClampBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedSub
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::RevSubClampBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedRevSub
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::AddSrcColorBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedAddSrcColor
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::ShadedSampler>::Execute, // Shaded
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampShadedBlend, TriScreenDrawerModes::ShadedSampler>::Execute, // AddShaded
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::StencilSampler>::Execute, // Stencil
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampShadedBlend, TriScreenDrawerModes::StencilSampler>::Execute, // AddStencil
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillOpaque
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillAdd
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::SubClampBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillSub
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::RevSubClampBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillRevSub
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::AddSrcColorBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillAddSrcColor
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::SkycapSampler>::Execute, // Skycap
|
||||
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::FuzzSampler>::Execute // Fuzz
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
int ScreenTriangle::FuzzStart = 0;
|
||||
|
|
|
@ -131,11 +131,6 @@ public:
|
|||
static void(*RectDrawers8[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *);
|
||||
static void(*RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *);
|
||||
|
||||
#ifndef NO_SSE
|
||||
static void(*TriDrawers32_SSE2[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *);
|
||||
static void(*RectDrawers32_SSE2[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *);
|
||||
#endif
|
||||
|
||||
static int FuzzStart;
|
||||
};
|
||||
|
||||
|
|
|
@ -185,42 +185,21 @@ ShadedTriVertex TriMatrix::operator*(TriVertex v) const
|
|||
sv.y = vy;
|
||||
sv.z = vz;
|
||||
sv.w = vw;
|
||||
#else
|
||||
__m128 m0 = _mm_loadu_ps(matrix);
|
||||
__m128 m1 = _mm_loadu_ps(matrix + 4);
|
||||
__m128 m2 = _mm_loadu_ps(matrix + 8);
|
||||
__m128 m3 = _mm_loadu_ps(matrix + 12);
|
||||
__m128 mv = _mm_loadu_ps(&v.x);
|
||||
m0 = _mm_mul_ps(m0, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(0, 0, 0, 0)));
|
||||
m1 = _mm_mul_ps(m1, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
m2 = _mm_mul_ps(m2, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(2, 2, 2, 2)));
|
||||
m3 = _mm_mul_ps(m3, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||
mv = _mm_add_ps(_mm_add_ps(_mm_add_ps(m0, m1), m2), m3);
|
||||
ShadedTriVertex sv;
|
||||
_mm_storeu_ps(&sv.x, mv);
|
||||
#endif
|
||||
sv.u = v.u;
|
||||
sv.v = v.v;
|
||||
return sv;
|
||||
#else
|
||||
if (CPU.bSSE2)
|
||||
{
|
||||
__m128 m0 = _mm_loadu_ps(matrix);
|
||||
__m128 m1 = _mm_loadu_ps(matrix + 4);
|
||||
__m128 m2 = _mm_loadu_ps(matrix + 8);
|
||||
__m128 m3 = _mm_loadu_ps(matrix + 12);
|
||||
__m128 mv = _mm_loadu_ps(&v.x);
|
||||
m0 = _mm_mul_ps(m0, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(0, 0, 0, 0)));
|
||||
m1 = _mm_mul_ps(m1, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
m2 = _mm_mul_ps(m2, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(2, 2, 2, 2)));
|
||||
m3 = _mm_mul_ps(m3, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||
mv = _mm_add_ps(_mm_add_ps(_mm_add_ps(m0, m1), m2), m3);
|
||||
ShadedTriVertex sv;
|
||||
_mm_storeu_ps(&sv.x, mv);
|
||||
sv.u = v.u;
|
||||
sv.v = v.v;
|
||||
return sv;
|
||||
}
|
||||
else
|
||||
{
|
||||
float vx = matrix[0 * 4 + 0] * v.x + matrix[1 * 4 + 0] * v.y + matrix[2 * 4 + 0] * v.z + matrix[3 * 4 + 0] * v.w;
|
||||
float vy = matrix[0 * 4 + 1] * v.x + matrix[1 * 4 + 1] * v.y + matrix[2 * 4 + 1] * v.z + matrix[3 * 4 + 1] * v.w;
|
||||
float vz = matrix[0 * 4 + 2] * v.x + matrix[1 * 4 + 2] * v.y + matrix[2 * 4 + 2] * v.z + matrix[3 * 4 + 2] * v.w;
|
||||
float vw = matrix[0 * 4 + 3] * v.x + matrix[1 * 4 + 3] * v.y + matrix[2 * 4 + 3] * v.z + matrix[3 * 4 + 3] * v.w;
|
||||
ShadedTriVertex sv;
|
||||
sv.x = vx;
|
||||
sv.y = vy;
|
||||
sv.z = vz;
|
||||
sv.w = vw;
|
||||
sv.u = v.u;
|
||||
sv.v = v.v;
|
||||
return sv;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -35,7 +35,6 @@
|
|||
#include <sys/sysctl.h>
|
||||
#include <sys/time.h>
|
||||
#include <pthread.h>
|
||||
#include <libkern/OSAtomic.h>
|
||||
|
||||
#include "doomdef.h"
|
||||
#include "i_system.h"
|
||||
|
@ -124,12 +123,7 @@ void* TimerThreadFunc(void*)
|
|||
|
||||
if (!s_isTicFrozen)
|
||||
{
|
||||
// The following GCC/Clang intrinsic can be used instead of OS X specific function:
|
||||
// __sync_add_and_fetch(&s_tics, 1);
|
||||
// Although it's not supported on all platform/compiler combination,
|
||||
// e.g. GCC 4.0.1 with PowerPC target architecture
|
||||
|
||||
OSAtomicIncrement32(&s_tics);
|
||||
__sync_add_and_fetch(&s_tics, 1);
|
||||
}
|
||||
|
||||
s_timerStart = I_MSTime();
|
||||
|
|
|
@ -116,11 +116,22 @@
|
|||
|
||||
DFrameBuffer *CreateGLSWFrameBuffer(int width, int height, bool bgra, bool fullscreen);
|
||||
|
||||
CUSTOM_CVAR(Bool, vid_glswfb, true, CVAR_NOINITCALL | CVAR_GLOBALCONFIG | CVAR_NOINITCALL)
|
||||
{
|
||||
Printf("This won't take effect until " GAMENAME " is restarted.\n");
|
||||
}
|
||||
|
||||
EXTERN_CVAR(Bool, ticker )
|
||||
EXTERN_CVAR(Bool, vid_vsync)
|
||||
EXTERN_CVAR(Bool, vid_hidpi)
|
||||
|
||||
CUSTOM_CVAR(Bool, swtruecolor, true, CVAR_ARCHIVE | CVAR_GLOBALCONFIG | CVAR_NOINITCALL)
|
||||
#if defined __ppc__ || defined __ppc64__
|
||||
static const bool TRUECOLOR_DEFAULT = false;
|
||||
#else // other than PowerPC
|
||||
static const bool TRUECOLOR_DEFAULT = true;
|
||||
#endif // PowerPC
|
||||
|
||||
CUSTOM_CVAR(Bool, swtruecolor, TRUECOLOR_DEFAULT, CVAR_ARCHIVE | CVAR_GLOBALCONFIG | CVAR_NOINITCALL)
|
||||
{
|
||||
// Strictly speaking this doesn't require a mode switch, but it is the easiest
|
||||
// way to force a CreateFramebuffer call without a lot of refactoring.
|
||||
|
@ -551,9 +562,12 @@ CocoaVideo::CocoaVideo()
|
|||
|
||||
// Create OpenGL pixel format
|
||||
|
||||
NSOpenGLPixelFormat* pixelFormat = CreatePixelFormat(OpenGLProfile::Core);
|
||||
const OpenGLProfile defaultProfile = (1 == vid_renderer || vid_glswfb)
|
||||
? OpenGLProfile::Core
|
||||
: OpenGLProfile::Legacy;
|
||||
NSOpenGLPixelFormat* pixelFormat = CreatePixelFormat(defaultProfile);
|
||||
|
||||
if (nil == pixelFormat)
|
||||
if (nil == pixelFormat && OpenGLProfile::Core == defaultProfile)
|
||||
{
|
||||
pixelFormat = CreatePixelFormat(OpenGLProfile::Legacy);
|
||||
|
||||
|
@ -647,10 +661,20 @@ DFrameBuffer* CocoaVideo::CreateFrameBuffer(const int width, const int height, c
|
|||
{
|
||||
fb = new OpenGLFrameBuffer(NULL, width, height, 32, 60, fullscreen);
|
||||
}
|
||||
else if (vid_glswfb)
|
||||
{
|
||||
fb = CreateGLSWFrameBuffer(width, height, bgra, fullscreen);
|
||||
|
||||
if (!fb->IsValid())
|
||||
{
|
||||
delete fb;
|
||||
|
||||
fb = new CocoaFrameBuffer(width, height, bgra, fullscreen);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
//fb = new CocoaFrameBuffer(width, height, bgra, fullscreen);
|
||||
fb = CreateGLSWFrameBuffer(width, height, bgra, fullscreen);
|
||||
fb = new CocoaFrameBuffer(width, height, bgra, fullscreen);
|
||||
}
|
||||
|
||||
fb->SetFlash(flashColor, flashAmount);
|
||||
|
|
|
@ -2596,21 +2596,8 @@ bool S_ChangeMusic (const char *musicname, int order, bool looping, bool force)
|
|||
{
|
||||
if ((lumpnum = Wads.CheckNumForFullName (musicname, true, ns_music)) == -1)
|
||||
{
|
||||
if (strstr(musicname, "://") > musicname)
|
||||
{
|
||||
// Looks like a URL; try it as such.
|
||||
handle = I_RegisterURLSong(musicname);
|
||||
if (handle == NULL)
|
||||
{
|
||||
Printf ("Could not open \"%s\"\n", musicname);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
Printf ("Music \"%s\" not found\n", musicname);
|
||||
return false;
|
||||
}
|
||||
Printf ("Music \"%s\" not found\n", musicname);
|
||||
return false;
|
||||
}
|
||||
if (handle == NULL)
|
||||
{
|
||||
|
|
|
@ -705,14 +705,13 @@ PFloat::PFloat(unsigned int size)
|
|||
Flags |= TYPE_Float;
|
||||
if (size == 8)
|
||||
{
|
||||
#ifdef __i386__
|
||||
// According to System V i386 ABI alignment of double type is 4
|
||||
// GCC and Clang for 32-bit Intel targets follow this requirement
|
||||
// However GCC has -malign-double option to enable 8-byte alignment
|
||||
// So calculation of the actual alignment is needed
|
||||
struct AlignmentCheck { uint8_t i; double d; };
|
||||
Align = static_cast<unsigned int>(offsetof(AlignmentCheck, d));
|
||||
#endif // __i386__
|
||||
if (sizeof(void*) == 4)
|
||||
{
|
||||
// Some ABIs for 32-bit platforms define alignment of double type as 4 bytes
|
||||
// Intel POSIX (System V ABI) and PowerPC Macs are examples of those
|
||||
struct AlignmentCheck { uint8_t i; double d; };
|
||||
Align = static_cast<unsigned int>(offsetof(AlignmentCheck, d));
|
||||
}
|
||||
|
||||
SetDoubleSymbols();
|
||||
}
|
||||
|
|
|
@ -558,25 +558,6 @@ MusInfo *I_RegisterCDSong (int track, int id)
|
|||
return info;
|
||||
}
|
||||
|
||||
//==========================================================================
|
||||
//
|
||||
//
|
||||
//
|
||||
//==========================================================================
|
||||
|
||||
MusInfo *I_RegisterURLSong (const char *url)
|
||||
{
|
||||
StreamSong *song;
|
||||
|
||||
song = new StreamSong(url);
|
||||
if (song->IsValid())
|
||||
{
|
||||
return song;
|
||||
}
|
||||
delete song;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//==========================================================================
|
||||
//
|
||||
// ungzip
|
||||
|
|
|
@ -57,7 +57,6 @@ class MusInfo;
|
|||
struct MidiDeviceSetting;
|
||||
MusInfo *I_RegisterSong (FileReader *reader, MidiDeviceSetting *device);
|
||||
MusInfo *I_RegisterCDSong (int track, int cdid = 0);
|
||||
MusInfo *I_RegisterURLSong (const char *url);
|
||||
|
||||
// The base music class. Everything is derived from this --------------------
|
||||
|
||||
|
|
|
@ -594,7 +594,6 @@ class StreamSong : public MusInfo
|
|||
{
|
||||
public:
|
||||
StreamSong (FileReader *reader);
|
||||
StreamSong (const char *url);
|
||||
~StreamSong ();
|
||||
void Play (bool looping, int subsong);
|
||||
void Pause ();
|
||||
|
|
|
@ -576,11 +576,6 @@ std::pair<SoundHandle,bool> SoundRenderer::LoadSoundVoc(uint8_t *sfxdata, int le
|
|||
return retval;
|
||||
}
|
||||
|
||||
SoundStream *SoundRenderer::OpenStream(const char *url, int flags)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
SoundDecoder *SoundRenderer::CreateDecoder(FileReader *reader)
|
||||
{
|
||||
SoundDecoder *decoder = NULL;
|
||||
|
|
|
@ -108,7 +108,6 @@ public:
|
|||
// Streaming sounds.
|
||||
virtual SoundStream *CreateStream (SoundStreamCallback callback, int buffbytes, int flags, int samplerate, void *userdata) = 0;
|
||||
virtual SoundStream *OpenStream (FileReader *reader, int flags) = 0;
|
||||
virtual SoundStream *OpenStream (const char *url, int flags);
|
||||
|
||||
// Starts a sound.
|
||||
virtual FISoundChannel *StartSound (SoundHandle sfx, float vol, int pitch, int chanflags, FISoundChannel *reuse_chan) = 0;
|
||||
|
|
|
@ -296,7 +296,7 @@ FString OPLMIDIDevice::GetStats()
|
|||
char star[3] = { TEXTCOLOR_ESCAPE, 'A', '*' };
|
||||
for (uint32_t i = 0; i < io->NumChannels; ++i)
|
||||
{
|
||||
if (voices[i].index == -1)
|
||||
if (voices[i].index == ~0u)
|
||||
{
|
||||
star[1] = CR_BRICK + 'A';
|
||||
}
|
||||
|
|
|
@ -90,11 +90,6 @@ StreamSong::StreamSong (FileReader *reader)
|
|||
m_Stream = GSnd->OpenStream (reader, SoundStream::Loop);
|
||||
}
|
||||
|
||||
StreamSong::StreamSong (const char *url)
|
||||
{
|
||||
m_Stream = GSnd->OpenStream (url, SoundStream::Loop);
|
||||
}
|
||||
|
||||
bool StreamSong::IsPlaying ()
|
||||
{
|
||||
if (m_Status != STATE_Stopped)
|
||||
|
|
|
@ -230,7 +230,7 @@ class OpenALSoundStream : public SoundStream
|
|||
if(Renderer->FreeSfx.Size() == 0)
|
||||
{
|
||||
FSoundChan *lowest = Renderer->FindLowestChannel();
|
||||
if(lowest) Renderer->StopChannel(lowest);
|
||||
if(lowest) Renderer->ForceStopChannel(lowest);
|
||||
|
||||
if(Renderer->FreeSfx.Size() == 0)
|
||||
return false;
|
||||
|
@ -810,6 +810,14 @@ OpenALSoundRenderer::OpenALSoundRenderer()
|
|||
return;
|
||||
}
|
||||
|
||||
ALCint refresh=0;
|
||||
alcGetIntegerv(Device, ALC_REFRESH, 1, &refresh);
|
||||
if(refresh > 0)
|
||||
{
|
||||
// Round up instead of down
|
||||
UpdateTimeMS = (1000+refresh-1) / refresh;
|
||||
}
|
||||
|
||||
ALCint numMono=0, numStereo=0;
|
||||
alcGetIntegerv(Device, ALC_MONO_SOURCES, 1, &numMono);
|
||||
alcGetIntegerv(Device, ALC_STEREO_SOURCES, 1, &numStereo);
|
||||
|
@ -1205,7 +1213,7 @@ std::pair<SoundHandle,bool> OpenALSoundRenderer::LoadSound(uint8_t *sfxdata, int
|
|||
ChannelConfig chans;
|
||||
SampleType type;
|
||||
int srate;
|
||||
uint32_t loop_start = 0, loop_end = 0;
|
||||
uint32_t loop_start = 0, loop_end = ~0u;
|
||||
bool startass = false, endass = false;
|
||||
|
||||
if (!memcmp(sfxdata, "OggS", 4) || !memcmp(sfxdata, "FLAC", 4))
|
||||
|
@ -1282,8 +1290,9 @@ std::pair<SoundHandle,bool> OpenALSoundRenderer::LoadSound(uint8_t *sfxdata, int
|
|||
|
||||
if (!startass) loop_start = Scale(loop_start, srate, 1000);
|
||||
if (!endass) loop_end = Scale(loop_end, srate, 1000);
|
||||
if (loop_start < 0) loop_start = 0;
|
||||
if (loop_end >= data.Size() / samplesize) loop_end = data.Size() / samplesize - 1;
|
||||
const uint32_t samples = data.Size() / samplesize;
|
||||
if (loop_start > samples) loop_start = 0;
|
||||
if (loop_end > samples) loop_end = samples;
|
||||
|
||||
if ((loop_start > 0 || loop_end > 0) && loop_end > loop_start && AL.SOFT_loop_points)
|
||||
{
|
||||
|
@ -1314,7 +1323,7 @@ void OpenALSoundRenderer::UnloadSound(SoundHandle sfx)
|
|||
if((ALuint)bufID == buffer)
|
||||
{
|
||||
FSoundChan *next = schan->NextChan;
|
||||
StopChannel(schan);
|
||||
ForceStopChannel(schan);
|
||||
schan = next;
|
||||
continue;
|
||||
}
|
||||
|
@ -1322,6 +1331,20 @@ void OpenALSoundRenderer::UnloadSound(SoundHandle sfx)
|
|||
schan = schan->NextChan;
|
||||
}
|
||||
|
||||
// Make sure to kill any currently fading sounds too
|
||||
for(auto iter = FadingSources.begin();iter != FadingSources.end();)
|
||||
{
|
||||
ALint bufID = 0;
|
||||
alGetSourcei(iter->first, AL_BUFFER, &bufID);
|
||||
if(static_cast<ALuint>(bufID) == buffer)
|
||||
{
|
||||
FreeSource(iter->first);
|
||||
iter = FadingSources.erase(iter);
|
||||
}
|
||||
else
|
||||
++iter;
|
||||
}
|
||||
|
||||
alDeleteBuffers(1, &buffer);
|
||||
getALError();
|
||||
}
|
||||
|
@ -1358,7 +1381,7 @@ FISoundChannel *OpenALSoundRenderer::StartSound(SoundHandle sfx, float vol, int
|
|||
if(FreeSfx.Size() == 0)
|
||||
{
|
||||
FSoundChan *lowest = FindLowestChannel();
|
||||
if(lowest) StopChannel(lowest);
|
||||
if(lowest) ForceStopChannel(lowest);
|
||||
|
||||
if(FreeSfx.Size() == 0)
|
||||
return NULL;
|
||||
|
@ -1461,7 +1484,7 @@ FISoundChannel *OpenALSoundRenderer::StartSound3D(SoundHandle sfx, SoundListener
|
|||
{
|
||||
if(lowest->Priority < priority || (lowest->Priority == priority &&
|
||||
lowest->DistanceSqr > dist_sqr))
|
||||
StopChannel(lowest);
|
||||
ForceStopChannel(lowest);
|
||||
}
|
||||
if(FreeSfx.Size() == 0)
|
||||
return NULL;
|
||||
|
@ -1665,16 +1688,8 @@ void OpenALSoundRenderer::ChannelVolume(FISoundChannel *chan, float volume)
|
|||
alSourcef(source, AL_GAIN, SfxVolume * volume);
|
||||
}
|
||||
|
||||
void OpenALSoundRenderer::StopChannel(FISoundChannel *chan)
|
||||
void OpenALSoundRenderer::FreeSource(ALuint source)
|
||||
{
|
||||
if(chan == NULL || chan->SysChannel == NULL)
|
||||
return;
|
||||
|
||||
ALuint source = GET_PTRID(chan->SysChannel);
|
||||
// Release first, so it can be properly marked as evicted if it's being
|
||||
// forcefully killed
|
||||
S_ChannelEnded(chan);
|
||||
|
||||
alSourceRewind(source);
|
||||
alSourcei(source, AL_BUFFER, 0);
|
||||
getALError();
|
||||
|
@ -1684,11 +1699,48 @@ void OpenALSoundRenderer::StopChannel(FISoundChannel *chan)
|
|||
PausableSfx.Delete(i);
|
||||
if((i=ReverbSfx.Find(source)) < ReverbSfx.Size())
|
||||
ReverbSfx.Delete(i);
|
||||
if((i=SfxGroup.Find(source)) < SfxGroup.Size())
|
||||
SfxGroup.Delete(i);
|
||||
|
||||
SfxGroup.Delete(SfxGroup.Find(source));
|
||||
FreeSfx.Push(source);
|
||||
}
|
||||
|
||||
void OpenALSoundRenderer::StopChannel(FISoundChannel *chan)
|
||||
{
|
||||
if(chan == NULL || chan->SysChannel == NULL)
|
||||
return;
|
||||
|
||||
ALuint source = GET_PTRID(chan->SysChannel);
|
||||
// Release first, so it can be properly marked as evicted if it's being killed
|
||||
S_ChannelEnded(chan);
|
||||
|
||||
ALint state = AL_INITIAL;
|
||||
alGetSourcei(source, AL_SOURCE_STATE, &state);
|
||||
if(state != AL_PLAYING)
|
||||
FreeSource(source);
|
||||
else
|
||||
{
|
||||
// The sound is being killed while playing, so set its gain to 0 and track it
|
||||
// as it fades.
|
||||
alSourcef(source, AL_GAIN, 0.f);
|
||||
getALError();
|
||||
|
||||
FadingSources.insert(std::make_pair(
|
||||
source, std::chrono::steady_clock::now().time_since_epoch().count()
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
void OpenALSoundRenderer::ForceStopChannel(FISoundChannel *chan)
|
||||
{
|
||||
ALuint source = GET_PTRID(chan->SysChannel);
|
||||
if(!source) return;
|
||||
|
||||
S_ChannelEnded(chan);
|
||||
FreeSource(source);
|
||||
}
|
||||
|
||||
|
||||
unsigned int OpenALSoundRenderer::GetPosition(FISoundChannel *chan)
|
||||
{
|
||||
if(chan == NULL || chan->SysChannel == NULL)
|
||||
|
@ -1945,6 +1997,23 @@ void OpenALSoundRenderer::UpdateSounds()
|
|||
{
|
||||
alProcessUpdatesSOFT();
|
||||
|
||||
if(!FadingSources.empty())
|
||||
{
|
||||
auto cur_time = std::chrono::steady_clock::now().time_since_epoch();
|
||||
for(auto iter = FadingSources.begin();iter != FadingSources.end();)
|
||||
{
|
||||
auto time_diff = std::chrono::duration_cast<std::chrono::milliseconds>(cur_time -
|
||||
std::chrono::steady_clock::time_point::duration(iter->second));
|
||||
if(time_diff.count() >= UpdateTimeMS)
|
||||
{
|
||||
FreeSource(iter->first);
|
||||
iter = FadingSources.erase(iter);
|
||||
}
|
||||
else
|
||||
++iter;
|
||||
}
|
||||
}
|
||||
|
||||
if(ALC.EXT_disconnect)
|
||||
{
|
||||
ALCint connected = ALC_TRUE;
|
||||
|
@ -2030,17 +2099,18 @@ void OpenALSoundRenderer::PrintStatus()
|
|||
|
||||
FString OpenALSoundRenderer::GatherStats()
|
||||
{
|
||||
ALCint updates = 1;
|
||||
alcGetIntegerv(Device, ALC_REFRESH, 1, &updates);
|
||||
FString out;
|
||||
|
||||
ALCint refresh = 1;
|
||||
alcGetIntegerv(Device, ALC_REFRESH, 1, &refresh);
|
||||
getALCError(Device);
|
||||
|
||||
uint32_t total = Sources.Size();
|
||||
uint32_t used = SfxGroup.Size()+Streams.Size();
|
||||
uint32_t unused = FreeSfx.Size();
|
||||
|
||||
FString out;
|
||||
out.Format("%u sources (" TEXTCOLOR_YELLOW"%u" TEXTCOLOR_NORMAL" active, " TEXTCOLOR_YELLOW"%u" TEXTCOLOR_NORMAL" free), Update interval: " TEXTCOLOR_YELLOW"%d" TEXTCOLOR_NORMAL"ms",
|
||||
total, used, unused, 1000/updates);
|
||||
out.Format("%u sources (" TEXTCOLOR_YELLOW"%u" TEXTCOLOR_NORMAL" active, " TEXTCOLOR_YELLOW"%u" TEXTCOLOR_NORMAL" free), Update interval: " TEXTCOLOR_YELLOW"%.1f" TEXTCOLOR_NORMAL"ms",
|
||||
total, used, unused, 1000.f/static_cast<float>(refresh));
|
||||
return out;
|
||||
}
|
||||
|
||||
|
@ -2105,7 +2175,7 @@ void OpenALSoundRenderer::PurgeStoppedSources()
|
|||
{
|
||||
if(schan->SysChannel != NULL && src == GET_PTRID(schan->SysChannel))
|
||||
{
|
||||
StopChannel(schan);
|
||||
ForceStopChannel(schan);
|
||||
break;
|
||||
}
|
||||
schan = schan->NextChan;
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
#include <mutex>
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "i_sound.h"
|
||||
#include "s_sound.h"
|
||||
|
@ -200,8 +201,10 @@ private:
|
|||
void RemoveStream(OpenALSoundStream *stream);
|
||||
|
||||
void LoadReverb(const ReverbContainer *env);
|
||||
void FreeSource(ALuint source);
|
||||
void PurgeStoppedSources();
|
||||
static FSoundChan *FindLowestChannel();
|
||||
void ForceStopChannel(FISoundChannel *chan);
|
||||
|
||||
std::thread StreamThread;
|
||||
std::mutex StreamLock;
|
||||
|
@ -222,6 +225,10 @@ private:
|
|||
TArray<ALuint> ReverbSfx;
|
||||
TArray<ALuint> SfxGroup;
|
||||
|
||||
int UpdateTimeMS;
|
||||
using SourceTimeMap = std::unordered_map<ALuint,int64_t>;
|
||||
SourceTimeMap FadingSources;
|
||||
|
||||
const ReverbContainer *PrevEnvironment;
|
||||
|
||||
typedef TMap<uint16_t,ALuint> EffectMap;
|
||||
|
|
|
@ -140,7 +140,7 @@ void musicBlock::voiceKeyOn(uint32_t slot, uint32_t channo, GenMidiInstrument *i
|
|||
|
||||
// Work out the note to use. This is normally the same as
|
||||
// the key, unless it is a fixed pitch instrument.
|
||||
uint32_t note;
|
||||
int note;
|
||||
if (instrument->flags & GENMIDI_FLAG_FIXED) note = instrument->fixed_note;
|
||||
else if (channo == CHAN_PERCUSSION) note = 60;
|
||||
else note = key;
|
||||
|
@ -475,6 +475,6 @@ void musicBlock::stopAllVoices()
|
|||
{
|
||||
for (uint32_t i = 0; i < io->NumChannels; i++)
|
||||
{
|
||||
if (voices[i].index >= 0) releaseVoice(i, 1);
|
||||
if (voices[i].index != ~0u) releaseVoice(i, 1);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -115,13 +115,7 @@ extern "C" unsigned __int64 __rdtsc(void);
|
|||
#pragma intrinsic(__rdtsc)
|
||||
inline unsigned __int64 rdtsc()
|
||||
{
|
||||
#ifndef _M_X64
|
||||
if (CPU.bRDTSC)
|
||||
#endif
|
||||
{
|
||||
return __rdtsc();
|
||||
}
|
||||
return 0;
|
||||
return __rdtsc();
|
||||
}
|
||||
#else
|
||||
inline uint64_t rdtsc()
|
||||
|
|
|
@ -778,7 +778,7 @@ bool DCanvas::ParseDrawTextureTags(FTexture *img, double x, double y, uint32_t t
|
|||
if (parms->lclip < clipleft) parms->lclip = clipleft;
|
||||
if (parms->rclip > clipleft + clipwidth) parms->rclip = clipleft + clipwidth;
|
||||
if (parms->uclip < cliptop) parms->uclip = cliptop;
|
||||
if (parms->dclip < cliptop + clipheight) parms->uclip = cliptop + clipheight;
|
||||
if (parms->dclip > cliptop + clipheight) parms->dclip = cliptop + clipheight;
|
||||
}
|
||||
|
||||
if (parms->uclip >= parms->dclip || parms->lclip >= parms->rclip)
|
||||
|
|
|
@ -410,7 +410,6 @@ void InitPalette ()
|
|||
R_InitColormaps ();
|
||||
}
|
||||
|
||||
void DoBlending_MMX (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a);
|
||||
void DoBlending_SSE2 (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a);
|
||||
|
||||
void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
|
||||
|
@ -435,37 +434,17 @@ void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, in
|
|||
return;
|
||||
}
|
||||
#if defined(_M_X64) || defined(_M_IX86) || defined(__i386__) || defined(__amd64__)
|
||||
else if (CPU.bSSE2)
|
||||
else if (count >= 4)
|
||||
{
|
||||
if (count >= 4)
|
||||
int not3count = count & ~3;
|
||||
DoBlending_SSE2 (from, to, not3count, r, g, b, a);
|
||||
count &= 3;
|
||||
if (count <= 0)
|
||||
{
|
||||
int not3count = count & ~3;
|
||||
DoBlending_SSE2 (from, to, not3count, r, g, b, a);
|
||||
count &= 3;
|
||||
if (count <= 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
from += not3count;
|
||||
to += not3count;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(_M_IX86) || defined(__i386__)
|
||||
else if (CPU.bMMX)
|
||||
{
|
||||
if (count >= 4)
|
||||
{
|
||||
int not3count = count & ~3;
|
||||
DoBlending_MMX (from, to, not3count, r, g, b, a);
|
||||
count &= 3;
|
||||
if (count <= 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
from += not3count;
|
||||
to += not3count;
|
||||
return;
|
||||
}
|
||||
from += not3count;
|
||||
to += not3count;
|
||||
}
|
||||
#endif
|
||||
int i, ia;
|
||||
|
|
|
@ -31,11 +31,13 @@
|
|||
**
|
||||
*/
|
||||
|
||||
#include "gitinfo.h"
|
||||
|
||||
#ifndef __VERSION_H__
|
||||
#define __VERSION_H__
|
||||
|
||||
#ifdef _WIN32
|
||||
#include "gitinfo.h"
|
||||
#endif // _WIN32
|
||||
|
||||
const char *GetGitDescription();
|
||||
const char *GetGitHash();
|
||||
const char *GetGitTime();
|
||||
|
|
|
@ -707,7 +707,7 @@ void CalculateCPUSpeed()
|
|||
|
||||
QueryPerformanceFrequency (&freq);
|
||||
|
||||
if (freq.QuadPart != 0 && CPU.bRDTSC)
|
||||
if (freq.QuadPart != 0)
|
||||
{
|
||||
LARGE_INTEGER count1, count2;
|
||||
cycle_t ClockCalibration;
|
||||
|
|
|
@ -167,7 +167,7 @@ public:
|
|||
Win32GLVideo::Win32GLVideo(int parm) : m_Modes(NULL), m_IsFullscreen(false)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
if (CPU.bRDTSC) gl_CalculateCPUSpeed();
|
||||
gl_CalculateCPUSpeed();
|
||||
#endif
|
||||
I_SetWndProc();
|
||||
m_DisplayWidth = vid_defwidth;
|
||||
|
|
47
src/x86.cpp
47
src/x86.cpp
|
@ -244,9 +244,6 @@ void DumpCPUInfo(const CPUInfo *cpu)
|
|||
cpu->Family, cpu->Model, cpu->Stepping);
|
||||
}
|
||||
Printf(" Features:");
|
||||
if (cpu->bMMX) Printf(" MMX");
|
||||
if (cpu->bMMXPlus) Printf(" MMX+");
|
||||
if (cpu->bSSE) Printf(" SSE");
|
||||
if (cpu->bSSE2) Printf(" SSE2");
|
||||
if (cpu->bSSE3) Printf(" SSE3");
|
||||
if (cpu->bSSSE3) Printf(" SSSE3");
|
||||
|
@ -258,50 +255,6 @@ void DumpCPUInfo(const CPUInfo *cpu)
|
|||
}
|
||||
}
|
||||
|
||||
#if !defined(__amd64__) && !defined(_M_X64)
|
||||
|
||||
void DoBlending_MMX(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
|
||||
{
|
||||
__m64 blendcolor;
|
||||
__m64 blendalpha;
|
||||
__m64 zero;
|
||||
__m64 blending256;
|
||||
__m64 color1;
|
||||
__m64 color2;
|
||||
|
||||
zero = _mm_setzero_si64();
|
||||
#ifndef __GNUC__
|
||||
blending256.m64_i64 = 0x10001000100;
|
||||
#else
|
||||
blending256 = (__m64)0x10001000100ll;
|
||||
#endif
|
||||
|
||||
blendcolor = _mm_unpacklo_pi8(_m_from_int((r << 16) | (g << 8) | b), zero); // 000000RR 00GG00BB
|
||||
blendalpha = _mm_unpacklo_pi8(_m_from_int((a << 16) | (a << 8) | a), zero); // 000000AA 00AA00AA
|
||||
|
||||
blendcolor = _mm_mullo_pi16(blendcolor, blendalpha); // premultiply blend by alpha
|
||||
blendalpha = _mm_subs_pu16(blending256, blendalpha); // one minus alpha
|
||||
|
||||
// Do two colors per iteration: Count must be even
|
||||
for (count >>= 1; count > 0; --count)
|
||||
{
|
||||
color1 = *(__m64 *)from; // 00r2g2b2 00r1g1b1
|
||||
from += 2;
|
||||
color2 = _mm_unpackhi_pi8(color1, zero); // 000000r2 00g200b2
|
||||
color1 = _mm_unpacklo_pi8(color1, zero); // 000000r1 00g100b1
|
||||
color1 = _mm_mullo_pi16(blendalpha, color1); // 0000r1rr g1ggb1bb
|
||||
color2 = _mm_mullo_pi16(blendalpha, color2); // 0000r2rr g2ggb2bb
|
||||
color1 = _mm_adds_pu16(blendcolor, color1);
|
||||
color2 = _mm_adds_pu16(blendcolor, color2);
|
||||
color1 = _mm_srli_pi16(color1, 8);
|
||||
color2 = _mm_srli_pi16(color2, 8);
|
||||
*(__m64 *)to = _mm_packs_pu16(color1, color2); // 00r2g2b2 00r1g1b1
|
||||
to += 2;
|
||||
}
|
||||
_mm_empty();
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
|
||||
{
|
||||
|
|
Loading…
Reference in a new issue