Use multiple threads for xBRZ upscaling

The implementation relies on the Concurrency Runtime, Grand Central Dispatch (a.k.a. libdispatch), or OpenMP, depending on which of them is available.
This commit is contained in:
alexey.lysiuk 2017-07-14 15:14:34 +03:00
parent ac811f99c0
commit 970adff5e7
2 changed files with 66 additions and 17 deletions

View file

@ -265,6 +265,16 @@ else( X64 )
set( CMAKE_CXX_FLAGS ${SAFE_CMAKE_CXX_FLAGS} )
endif( X64 )
# Probe for a parallel-for backend. The results are cached in
# HAVE_PARALLEL_FOR and HAVE_DISPATCH_APPLY respectively.

# First candidate: concurrency::parallel_for() declared in <ppl.h>.
check_cxx_source_compiles( "#include <ppl.h>
int main() { concurrency::parallel_for(0, 1, 1, [](int) { } ); }"
	HAVE_PARALLEL_FOR )

# Second candidate, probed only when the first one is unavailable:
# dispatch_apply() declared in <dispatch/dispatch.h>, called with a block literal.
if( NOT HAVE_PARALLEL_FOR )
	check_cxx_source_compiles( "#include <dispatch/dispatch.h>
int main() { dispatch_apply(1, dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^(size_t) { }); }"
		HAVE_DISPATCH_APPLY )
endif()
# Set up flags for MSVC
if (MSVC)
set( CMAKE_CXX_FLAGS "/MP ${CMAKE_CXX_FLAGS}" )
@ -567,6 +577,23 @@ if( HAVE_MMX )
endif( ZD_CMAKE_COMPILER_IS_GNUCXX_COMPATIBLE )
endif( HAVE_MMX )
# Tell the sources which parallel-for backend was detected.
# Precedence: HAVE_PARALLEL_FOR, then HAVE_DISPATCH_APPLY, then OpenMP.
if( HAVE_PARALLEL_FOR )
	add_definitions( -DHAVE_PARALLEL_FOR=1 )
elseif( HAVE_DISPATCH_APPLY )
	add_definitions( -DHAVE_DISPATCH_APPLY=1 )
else()
	# Neither backend was found: fall back to OpenMP unless the user opted out.
	option( NO_OPENMP "Disable usage of OpenMP" OFF )

	if( NOT NO_OPENMP )
		# Find modules are meant to be loaded through find_package() rather than
		# include(), so that the usual find_package() machinery is in effect.
		find_package( OpenMP )

		if( OPENMP_FOUND )
			# Append the OpenMP compiler switches to both C and C++ flags.
			set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}" )
			set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}" )
		endif()
	endif()
endif()
# Build rule producing xlat_parser.c and xlat_parser.h in the current binary
# directory by running lemon on xlat/xlat_parser.y from the source directory.
# The rule re-runs when either lemon or the grammar file changes.
add_custom_command(
	OUTPUT
		${CMAKE_CURRENT_BINARY_DIR}/xlat_parser.c
		${CMAKE_CURRENT_BINARY_DIR}/xlat_parser.h
	COMMAND lemon -C${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/xlat/xlat_parser.y
	DEPENDS lemon ${CMAKE_CURRENT_SOURCE_DIR}/xlat/xlat_parser.y
	# VERBATIM makes argument escaping identical on every platform, which
	# matters when the source or build path contains spaces.
	VERBATIM )

View file

@ -46,16 +46,44 @@
#include "gl/xbr/xbrz.h"
#include "gl/xbr/xbrz_old.h"
#ifdef __APPLE__
# include <AvailabilityMacros.h>
# if MAC_OS_X_VERSION_MAX_ALLOWED >= 1060
# define GZ_USE_LIBDISPATCH
# endif // MAC_OS_X_VERSION_MAX_ALLOWED >= 1060
#endif // __APPLE__
#ifdef HAVE_PARALLEL_FOR
#ifdef GZ_USE_LIBDISPATCH
# include <dispatch/dispatch.h>
#endif // GZ_USE_LIBDISPATCH
#include <ppl.h>
// Concurrency Runtime backend.
// Calls function(i) for i = 0, step, 2 * step, ... while i < count,
// distributing the calls via concurrency::parallel_for().
//   count    - exclusive upper bound of the index range
//   step     - distance between consecutive indices
//   function - callable invoked with each index
template <typename Index, typename Function>
inline void parallel_for(const Index count, const Index step, const Function& function)
{
	// The start value must have the same type as count and step: a bare
	// literal 0 is an int and would make the index type deduction ambiguous
	// for any Index other than int.
	concurrency::parallel_for(static_cast<Index>(0), count, step, function);
}
#elif defined HAVE_DISPATCH_APPLY
#include <dispatch/dispatch.h>
template <typename Index, typename Function>
inline void parallel_for(const Index count, const Index step, const Function& function)
{
const dispatch_queue_t queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
dispatch_apply(count / step + 1, queue, ^(size_t sliceY)
{
function(sliceY * step);
});
}
#else
// Fallback backend: a plain counted loop annotated for OpenMP.
// Calls function(offset) for offset = 0, step, 2 * step, ... while
// offset < count. If the compiler does not act on the OpenMP pragma,
// the loop simply runs sequentially.
//   count    - exclusive upper bound of the index range
//   step     - distance between consecutive indices
//   function - callable invoked with each index
template <typename Index, typename Function>
inline void parallel_for(const Index count, const Index step, const Function& function)
{
	// The loop is kept in this init/compare/increment shape so the pragma
	// can be applied to it.
	#pragma omp parallel for
	for (Index offset = 0; offset < count; offset += step)
	{
		function(offset);
	}
}
#endif // HAVE_PARALLEL_FOR
CUSTOM_CVAR(Int, gl_texture_hqresize, 0, CVAR_ARCHIVE | CVAR_GLOBALCONFIG | CVAR_NOINITCALL)
{
@ -87,7 +115,6 @@ CVAR (Flag, gl_texture_hqresize_textures, gl_texture_hqresize_targets, 1);
CVAR (Flag, gl_texture_hqresize_sprites, gl_texture_hqresize_targets, 2);
CVAR (Flag, gl_texture_hqresize_fonts, gl_texture_hqresize_targets, 4);
#ifdef GZ_USE_LIBDISPATCH
CVAR(Bool, gl_texture_hqresize_multithread, true, CVAR_ARCHIVE | CVAR_GLOBALCONFIG);
CUSTOM_CVAR(Int, gl_texture_hqresize_mt_width, 16, CVAR_ARCHIVE | CVAR_GLOBALCONFIG)
@ -101,7 +128,6 @@ CUSTOM_CVAR(Int, gl_texture_hqresize_mt_height, 4, CVAR_ARCHIVE | CVAR_GLOBALCON
if (self < 2) self = 2;
if (self > 1024) self = 1024;
}
#endif // GZ_USE_LIBDISPATCH
static void scale2x ( uint32_t* inputBuffer, uint32_t* outputBuffer, int inWidth, int inHeight )
@ -289,7 +315,6 @@ static unsigned char *xbrzHelper( void (*xbrzFunction) ( size_t, const uint32_t*
unsigned char * newBuffer = new unsigned char[outWidth*outHeight*4];
#ifdef GZ_USE_LIBDISPATCH
const int thresholdWidth = gl_texture_hqresize_mt_width;
const int thresholdHeight = gl_texture_hqresize_mt_height;
@ -297,16 +322,13 @@ static unsigned char *xbrzHelper( void (*xbrzFunction) ( size_t, const uint32_t*
&& inWidth > thresholdWidth
&& inHeight > thresholdHeight)
{
const dispatch_queue_t queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
dispatch_apply(inHeight / thresholdHeight + 1, queue, ^(size_t sliceY)
parallel_for(inHeight, thresholdHeight, [=](int sliceY)
{
xbrzFunction(N, reinterpret_cast<uint32_t*>(inputBuffer), reinterpret_cast<uint32_t*>(newBuffer),
inWidth, inHeight, xbrz::ARGB, xbrz::ScalerCfg(), sliceY * thresholdHeight, (sliceY + 1) * thresholdHeight);
inWidth, inHeight, xbrz::ARGB, xbrz::ScalerCfg(), sliceY, sliceY + thresholdHeight);
});
}
else
#endif // GZ_USE_LIBDISPATCH
{
xbrzFunction(N, reinterpret_cast<uint32_t*>(inputBuffer), reinterpret_cast<uint32_t*>(newBuffer),
inWidth, inHeight, xbrz::ARGB, xbrz::ScalerCfg(), 0, std::numeric_limits<int>::max());