From 60d78bcdf6c7d6f2aa31613ee60e483556821582 Mon Sep 17 00:00:00 2001 From: "alexey.lysiuk" Date: Thu, 27 Nov 2014 10:58:34 +0200 Subject: [PATCH] - implemented hqNx image scaling algorithms using MMX and SSE2 intrinsics --- src/gl/hqnx_asm/hq2x_asm.cpp | 182 +++++++++---------------- src/gl/hqnx_asm/hq3x_asm.cpp | 123 +++++++---------- src/gl/hqnx_asm/hq4x_asm.cpp | 212 +++++++++++------------------ src/gl/hqnx_asm/hqnx_asm.h | 193 ++++++++++++++++++++++++++ src/gl/hqnx_asm/hqnx_asm_Image.cpp | 4 + 5 files changed, 390 insertions(+), 324 deletions(-) diff --git a/src/gl/hqnx_asm/hq2x_asm.cpp b/src/gl/hqnx_asm/hq2x_asm.cpp index 59d52c180b..65e4603776 100644 --- a/src/gl/hqnx_asm/hq2x_asm.cpp +++ b/src/gl/hqnx_asm/hq2x_asm.cpp @@ -1,6 +1,7 @@ //hq2x filter demo program //---------------------------------------------------------- //Copyright (C) 2003 MaxSt ( maxst@hiend3d.com ) +//Copyright (C) 2012-2014 Alexey Lysiuk // //This program is free software; you can redistribute it and/or //modify it under the terms of the GNU Lesser General Public @@ -24,153 +25,104 @@ namespace HQnX_asm extern int LUT16to32[65536*2]; extern int RGBtoYUV[65536*2]; -static const __int64 reg_blank = 0; -static const __int64 const3 = 0x0003000300030003; -static const __int64 const5 = 0x0005000500050005; -static const __int64 const6 = 0x0006000600060006; -static const __int64 const14 = 0x000E000E000E000E; -static const __int64 treshold = 0x0000000000300706; +static const hq_vec const3 = hq_vec::expand(0x0003); +static const hq_vec const5 = hq_vec::expand(0x0005); +static const hq_vec const6 = hq_vec::expand(0x0006); +static const hq_vec const14 = hq_vec::expand(0x000E); inline void Interp1(unsigned char * pc, int c1, int c2) { //*((int*)pc) = (c1*3+c2)/4; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - pmullw mm1, const3 - paddw mm1, mm2 - psrlw mm1, 2 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = 
hq_vec::load(c1); + + result *= const3; + result += hq_vec::load(c2); + result >> 2; + + result.store(pc); } inline void Interp2(unsigned char * pc, int c1, int c2, int c3) { //*((int*)pc) = (c1*2+c2+c3) >> 2; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - movd mm3, c3 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - punpcklbw mm3, reg_blank - psllw mm1, 1 - paddw mm1, mm2 - paddw mm1, mm3 - psrlw mm1, 2 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c1); + + result << 1; + result += hq_vec::load(c2); + result += hq_vec::load(c3); + result >> 2; + + result.store(pc); } inline void Interp5(unsigned char * pc, int c1, int c2) { //*((int*)pc) = (c1+c2)/2; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - paddw mm1, mm2 - psrlw mm1, 1 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c1); + + result += hq_vec::load(c2); + result >> 1; + + result.store(pc); } inline void Interp6(unsigned char * pc, int c1, int c2, int c3) { //*((int*)pc) = (c1*5+c2*2+c3)/8; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - movd mm3, c3 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - punpcklbw mm3, reg_blank - pmullw mm1, const5 - psllw mm2, 1 - paddw mm1, mm3 - paddw mm1, mm2 - psrlw mm1, 3 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c1); + + result *= const5; + result += hq_vec::load(c2) << 1; + result += hq_vec::load(c3); + result >> 3; + + result.store(pc); } inline void Interp7(unsigned char * pc, int c1, int c2, int c3) { //*((int*)pc) = (c1*6+c2+c3)/8; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - movd mm3, c3 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - punpcklbw mm3, reg_blank - pmullw mm1, const6 - paddw mm2, mm3 - paddw mm1, mm2 - psrlw mm1, 3 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c1); + + result *= const6; + result 
+= hq_vec::load(c2); + result += hq_vec::load(c3); + result >> 3; + + result.store(pc); } inline void Interp9(unsigned char * pc, int c1, int c2, int c3) { //*((int*)pc) = (c1*2+(c2+c3)*3)/8; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - movd mm3, c3 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - punpcklbw mm3, reg_blank - psllw mm1, 1 - paddw mm2, mm3 - pmullw mm2, const3 - paddw mm1, mm2 - psrlw mm1, 3 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c2); + + result += hq_vec::load(c3); + result *= const3; + result += hq_vec::load(c1) << 1; + result >> 3; + + result.store(pc); } inline void Interp10(unsigned char * pc, int c1, int c2, int c3) { //*((int*)pc) = (c1*14+c2+c3)/16; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - movd mm3, c3 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - punpcklbw mm3, reg_blank - pmullw mm1, const14 - paddw mm2, mm3 - paddw mm1, mm2 - psrlw mm1, 4 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c1); + + result *= const14; + result += hq_vec::load(c2); + result += hq_vec::load(c3); + result >> 4; + + result.store(pc); } #define PIXEL00_0 *((int*)(pOut)) = c[5]; @@ -223,7 +175,7 @@ inline void Interp10(unsigned char * pc, int c1, int c2, int c3) #define PIXEL11_100 Interp10(pOut+BpL+4, c[5], c[6], c[8]); -int Diff(unsigned int w5, unsigned int w1); +bool Diff(const unsigned int, const unsigned int); void DLL hq2x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL ) { @@ -2985,7 +2937,7 @@ void DLL hq2x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL ) } pOut+=BpL; } - __asm emms + hq_vec::reset(); } } \ No newline at end of file diff --git a/src/gl/hqnx_asm/hq3x_asm.cpp b/src/gl/hqnx_asm/hq3x_asm.cpp index 8d1f3572ab..073ce66561 100644 --- a/src/gl/hqnx_asm/hq3x_asm.cpp +++ b/src/gl/hqnx_asm/hq3x_asm.cpp @@ -1,6 +1,7 @@ //hq3x filter demo program 
//---------------------------------------------------------- //Copyright (C) 2003 MaxSt ( maxst@hiend3d.com ) +//Copyright (C) 2012-2014 Alexey Lysiuk // //This program is free software; you can redistribute it and/or //modify it under the terms of the GNU Lesser General Public @@ -24,105 +25,73 @@ namespace HQnX_asm extern int LUT16to32[65536*2]; extern int RGBtoYUV[65536*2]; -static const __int64 reg_blank = 0; -static const __int64 const3 = 0x0003000300030003; -static const __int64 const7 = 0x0007000700070007; -static const __int64 treshold = 0x0000000000300706; +static const hq_vec const3 = hq_vec::expand(0x0003); +static const hq_vec const7 = hq_vec::expand(0x0007); inline void Interp1(unsigned char * pc, int c1, int c2) { //*((int*)pc) = (c1*3+c2)/4; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - pmullw mm1, const3 - paddw mm1, mm2 - psrlw mm1, 2 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c1); + + result *= const3; + result += hq_vec::load(c2); + result >> 2; + + result.store(pc); } inline void Interp2(unsigned char * pc, int c1, int c2, int c3) { // *((int*)pc) = (c1*2+c2+c3)/4; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - movd mm3, c3 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - punpcklbw mm3, reg_blank - psllw mm1, 1 - paddw mm1, mm2 - paddw mm1, mm3 - psrlw mm1, 2 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c1); + + result << 1; + result += hq_vec::load(c2); + result += hq_vec::load(c3); + result >> 2; + + result.store(pc); } inline void Interp3(unsigned char * pc, int c1, int c2) { //*((int*)pc) = (c1*7+c2)/8; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - pmullw mm1, const7 - paddw mm1, mm2 - psrlw mm1, 3 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c1); + + result *= const7; + result += 
hq_vec::load(c2); + result >> 3; + + result.store(pc); } inline void Interp4(unsigned char * pc, int c1, int c2, int c3) { //*((int*)pc) = (c1*2+(c2+c3)*7)/16; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - movd mm3, c3 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - punpcklbw mm3, reg_blank - psllw mm1, 1 - paddw mm2, mm3 - pmullw mm2, const7 - paddw mm1, mm2 - psrlw mm1, 4 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c2); + + result += hq_vec::load(c3); + result *= const7; + result += hq_vec::load(c1) << 1; + result >> 4; + + result.store(pc); } inline void Interp5(unsigned char * pc, int c1, int c2) { //*((int*)pc) = (c1+c2)/2; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - paddw mm1, mm2 - psrlw mm1, 1 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c1); + + result += hq_vec::load(c2); + result >> 1; + + result.store(pc); } #define PIXEL00_1M Interp1(pOut, c[5], c[1]); @@ -179,7 +148,7 @@ inline void Interp5(unsigned char * pc, int c1, int c2) #define PIXEL22_5 Interp5(pOut+BpL+BpL+8, c[6], c[8]); #define PIXEL22_C *((int*)(pOut+BpL+BpL+8)) = c[5]; -int Diff(unsigned int w5, unsigned int w1); +bool Diff(const unsigned int, const unsigned int); void DLL hq3x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL ) { @@ -3867,7 +3836,7 @@ void DLL hq3x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL ) pOut+=BpL; pOut+=BpL; } - __asm emms + hq_vec::reset(); } diff --git a/src/gl/hqnx_asm/hq4x_asm.cpp b/src/gl/hqnx_asm/hq4x_asm.cpp index 57a496c726..34a4744055 100644 --- a/src/gl/hqnx_asm/hq4x_asm.cpp +++ b/src/gl/hqnx_asm/hq4x_asm.cpp @@ -1,6 +1,7 @@ //hq4x filter demo program //---------------------------------------------------------- //Copyright (C) 2003 MaxSt ( maxst@hiend3d.com ) +//Copyright (C) 2012-2014 Alexey Lysiuk // //This program is free software; you can redistribute it 
and/or //modify it under the terms of the GNU Lesser General Public @@ -19,7 +20,6 @@ #include #include -#include #include #include "hqnx_asm.h" @@ -29,148 +29,103 @@ namespace HQnX_asm int LUT16to32[65536*2]; int RGBtoYUV[65536*2]; -static const __int64 reg_blank = 0; -static const __int64 const3 = 0x0003000300030003; -static const __int64 const5 = 0x0005000500050005; -static const __int64 const6 = 0x0006000600060006; -static const __int64 const7 = 0x0007000700070007; -static const __int64 treshold = 0x0000000000300706; +static const hq_vec const3 = hq_vec::expand(0x0003); +static const hq_vec const5 = hq_vec::expand(0x0005); +static const hq_vec const6 = hq_vec::expand(0x0006); +static const hq_vec const7 = hq_vec::expand(0x0007); inline void Interp1(unsigned char * pc, int c1, int c2) { //*((int*)pc) = (c1*3+c2)/4; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - pmullw mm1, const3 - paddw mm1, mm2 - psrlw mm1, 2 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c1); + + result *= const3; + result += hq_vec::load(c2); + result >> 2; + + result.store(pc); } inline void Interp2(unsigned char * pc, int c1, int c2, int c3) { // *((int*)pc) = (c1*2+c2+c3)/4; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - movd mm3, c3 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - punpcklbw mm3, reg_blank - psllw mm1, 1 - paddw mm1, mm2 - paddw mm1, mm3 - psrlw mm1, 2 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c1); + + result << 1; + result += hq_vec::load(c2); + result += hq_vec::load(c3); + result >> 2; + + result.store(pc); } inline void Interp3(unsigned char * pc, int c1, int c2) { //*((int*)pc) = (c1*7+c2)/8; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - pmullw mm1, const7 - paddw mm1, mm2 - psrlw mm1, 3 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec 
result = hq_vec::load(c1); + + result *= const7; + result += hq_vec::load(c2); + result >> 3; + + result.store(pc); } inline void Interp5(unsigned char * pc, int c1, int c2) { //*((int*)pc) = (c1+c2)/2; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - paddw mm1, mm2 - psrlw mm1, 1 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c1); + + result += hq_vec::load(c2); + result >> 1; + + result.store(pc); } inline void Interp6(unsigned char * pc, int c1, int c2, int c3) { //*((int*)pc) = (c1*5+c2*2+c3)/8; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - movd mm3, c3 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - punpcklbw mm3, reg_blank - pmullw mm1, const5 - psllw mm2, 1 - paddw mm1, mm3 - paddw mm1, mm2 - psrlw mm1, 3 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c1); + + result *= const5; + result += hq_vec::load(c2) << 1; + result += hq_vec::load(c3); + result >> 3; + + result.store(pc); } inline void Interp7(unsigned char * pc, int c1, int c2, int c3) { //*((int*)pc) = (c1*6+c2+c3)/8; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - movd mm3, c3 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - punpcklbw mm3, reg_blank - pmullw mm1, const6 - paddw mm2, mm3 - paddw mm1, mm2 - psrlw mm1, 3 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c1); + + result *= const6; + result += hq_vec::load(c2); + result += hq_vec::load(c3); + result >> 3; + + result.store(pc); } inline void Interp8(unsigned char * pc, int c1, int c2) { //*((int*)pc) = (c1*5+c2*3)/8; - __asm - { - mov eax, pc - movd mm1, c1 - movd mm2, c2 - punpcklbw mm1, reg_blank - punpcklbw mm2, reg_blank - pmullw mm1, const5 - pmullw mm2, const3 - paddw mm1, mm2 - psrlw mm1, 3 - packuswb mm1, reg_blank - movd [eax], mm1 - } + + hq_vec result = hq_vec::load(c1); + + result *= const5; + result += hq_vec::load(c2) * const3; + 
result >> 3; + + result.store(pc); } #define PIXEL00_0 *((int*)(pOut)) = c[5]; @@ -314,33 +269,26 @@ inline void Interp8(unsigned char * pc, int c1, int c2) #define PIXEL33_81 Interp8(pOut+BpL+BpL+BpL+12, c[5], c[6]); #define PIXEL33_82 Interp8(pOut+BpL+BpL+BpL+12, c[5], c[8]); - -#pragma warning(disable: 4035) - -int Diff(unsigned int w5, unsigned int w1) +bool Diff(const unsigned int rgb1, const unsigned int rgb2) { - __asm + if (rgb1 == rgb2) { - xor eax,eax - mov ebx,w5 - mov edx,w1 - cmp ebx,edx - je FIN - mov ecx,offset RGBtoYUV - movd mm1,[ecx + ebx*4] - movq mm5,mm1 - movd mm2,[ecx + edx*4] - psubusb mm1,mm2 - psubusb mm2,mm5 - por mm1,mm2 - psubusb mm1,treshold - movd eax,mm1 -FIN: + return false; } -} -// returns result in eax register + + static const hq_vec THRESHOLD = 0x00300706; + + const hq_vec yuv1 = RGBtoYUV[rgb1]; + const hq_vec yuv2 = RGBtoYUV[rgb2]; -#pragma warning(default: 4035) + const hq_vec delta1 = yuv1 - yuv2; + const hq_vec delta2 = yuv2 - yuv1; + + const hq_vec delta = delta1 | delta2; + const hq_vec result = delta - THRESHOLD; + + return 0 != result; +} void DLL hq4x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL ) { @@ -5412,7 +5360,7 @@ void DLL hq4x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL ) pOut += BpL; pOut += BpL; } - __asm emms + hq_vec::reset(); } void DLL InitLUTs() diff --git a/src/gl/hqnx_asm/hqnx_asm.h b/src/gl/hqnx_asm/hqnx_asm.h index 341e2dad30..2580f80ef4 100644 --- a/src/gl/hqnx_asm/hqnx_asm.h +++ b/src/gl/hqnx_asm/hqnx_asm.h @@ -2,6 +2,7 @@ //---------------------------------------------------------- //Copyright (C) 2003 MaxSt ( maxst@hiend3d.com ) //Copyright (C) 2009 Benjamin Berkels +//Copyright (C) 2012-2014 Alexey Lysiuk // //This program is free software; you can redistribute it and/or //modify it under the terms of the GNU Lesser General Public @@ -24,6 +25,198 @@ #include "hqnx_asm_Image.h" +// IMPORTANT NOTE! 
+// The following is not a generic vectorized math class. +// Each member function or overloaded operator performs one specific task to simplify client code; note that operator<< and operator>> shift the object in place and return *this. +// To re-implement this class for a different platform, you need to check very carefully +// the Intel C++ Intrinsic Reference at http://software.intel.com/file/18072/ + +#if defined _MSC_VER && defined _M_X64 + +// Implementation via SSE2 intrinsics +// MSVC doesn't support MMX intrinsics on x64 + +#include + +class hq_vec +{ +public: + hq_vec(const int value) + : m_value(_mm_cvtsi32_si128(value)) + { + } + + static hq_vec load(const int source) + { + return _mm_unpacklo_epi8(_mm_cvtsi32_si128(source), _mm_cvtsi32_si128(0)); + } + + static hq_vec expand(const short source) + { + return _mm_set_epi16(source, source, source, source, source, source, source, source); + } + + void store(unsigned char* const destination) const + { + *reinterpret_cast(destination) = _mm_cvtsi128_si32(_mm_packus_epi16(m_value, _mm_cvtsi32_si128(0))); + } + + static void reset() + { + } + + hq_vec& operator+=(const hq_vec& right) + { + m_value = _mm_add_epi16(m_value, right.m_value); + return *this; + } + + hq_vec& operator*=(const hq_vec& right) + { + m_value = _mm_mullo_epi16(m_value, right.m_value); + return *this; + } + + hq_vec& operator<<(const int count) + { + m_value = _mm_sll_epi16(m_value, _mm_cvtsi32_si128(count)); + return *this; + } + + hq_vec& operator>>(const int count) + { + m_value = _mm_srl_epi16(m_value, _mm_cvtsi32_si128(count)); + return *this; + } + +private: + __m128i m_value; + + hq_vec(const __m128i value) + : m_value(value) + { + } + + friend hq_vec operator- (const hq_vec&, const hq_vec&); + friend hq_vec operator* (const hq_vec&, const hq_vec&); + friend hq_vec operator| (const hq_vec&, const hq_vec&); + friend bool operator!=(const int, const hq_vec&); +}; + +inline hq_vec operator-(const hq_vec& left, const hq_vec& right) +{ + return _mm_subs_epu8(left.m_value, right.m_value); +} + +inline hq_vec operator*(const hq_vec& 
left, const hq_vec& right) +{ + return _mm_mullo_epi16(left.m_value, right.m_value); +} + +inline hq_vec operator|(const hq_vec& left, const hq_vec& right) +{ + return _mm_or_si128(left.m_value, right.m_value); +} + +inline bool operator!=(const int left, const hq_vec& right) +{ + return left != _mm_cvtsi128_si32(right.m_value); +} + +#else // _M_X64 + +// Implementation via MMX intrinsics + +#include + +class hq_vec +{ +public: + hq_vec(const int value) + : m_value(_mm_cvtsi32_si64(value)) + { + } + + static hq_vec load(const int source) + { + return _mm_unpacklo_pi8(_mm_cvtsi32_si64(source), _mm_cvtsi32_si64(0)); + } + + static hq_vec expand(const short source) + { + return _mm_set_pi16(source, source, source, source); + } + + void store(unsigned char* const destination) const + { + *reinterpret_cast(destination) = _mm_cvtsi64_si32(_mm_packs_pu16(m_value, _mm_cvtsi32_si64(0))); + } + + static void reset() + { + _mm_empty(); + } + + hq_vec& operator+=(const hq_vec& right) + { + m_value = _mm_add_pi16(m_value, right.m_value); + return *this; + } + + hq_vec& operator*=(const hq_vec& right) + { + m_value = _mm_mullo_pi16(m_value, right.m_value); + return *this; + } + + hq_vec& operator<<(const int count) + { + m_value = _mm_sll_pi16(m_value, _mm_cvtsi32_si64(count)); + return *this; + } + + hq_vec& operator>>(const int count) + { + m_value = _mm_srl_pi16(m_value, _mm_cvtsi32_si64(count)); + return *this; + } + +private: + __m64 m_value; + + hq_vec(const __m64 value) + : m_value(value) + { + } + + friend hq_vec operator- (const hq_vec&, const hq_vec&); + friend hq_vec operator* (const hq_vec&, const hq_vec&); + friend hq_vec operator| (const hq_vec&, const hq_vec&); + friend bool operator!=(const int, const hq_vec&); +}; + +inline hq_vec operator-(const hq_vec& left, const hq_vec& right) +{ + return _mm_subs_pu8(left.m_value, right.m_value); +} + +inline hq_vec operator*(const hq_vec& left, const hq_vec& right) +{ + return _mm_mullo_pi16(left.m_value, right.m_value); 
+} + +inline hq_vec operator|(const hq_vec& left, const hq_vec& right) +{ + return _mm_or_si64(left.m_value, right.m_value); +} + +inline bool operator!=(const int left, const hq_vec& right) +{ + return left != _mm_cvtsi64_si32(right.m_value); +} + +#endif // _MSC_VER && _M_X64 + namespace HQnX_asm { void DLL hq2x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL ); diff --git a/src/gl/hqnx_asm/hqnx_asm_Image.cpp b/src/gl/hqnx_asm/hqnx_asm_Image.cpp index be7d1b3502..7af1201739 100644 --- a/src/gl/hqnx_asm/hqnx_asm_Image.cpp +++ b/src/gl/hqnx_asm/hqnx_asm_Image.cpp @@ -20,6 +20,10 @@ #include #include "hqnx_asm_Image.h" +#ifndef _MSC_VER +#define _stricmp strcasecmp +#endif + namespace HQnX_asm {