mirror of
https://github.com/ZDoom/qzdoom-gpl.git
synced 2024-11-29 23:23:07 +00:00
Merge branch 'hqnx_intrinsics' of https://github.com/alexey-lysiuk/gzdoom
This commit is contained in:
commit
1389cd03ca
5 changed files with 390 additions and 324 deletions
|
@ -1,6 +1,7 @@
|
||||||
//hq2x filter demo program
|
//hq2x filter demo program
|
||||||
//----------------------------------------------------------
|
//----------------------------------------------------------
|
||||||
//Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
|
//Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
|
||||||
|
//Copyright (C) 2012-2014 Alexey Lysiuk
|
||||||
//
|
//
|
||||||
//This program is free software; you can redistribute it and/or
|
//This program is free software; you can redistribute it and/or
|
||||||
//modify it under the terms of the GNU Lesser General Public
|
//modify it under the terms of the GNU Lesser General Public
|
||||||
|
@ -24,153 +25,104 @@ namespace HQnX_asm
|
||||||
extern int LUT16to32[65536*2];
|
extern int LUT16to32[65536*2];
|
||||||
extern int RGBtoYUV[65536*2];
|
extern int RGBtoYUV[65536*2];
|
||||||
|
|
||||||
static const __int64 reg_blank = 0;
|
static const hq_vec const3 = hq_vec::expand(0x0003);
|
||||||
static const __int64 const3 = 0x0003000300030003;
|
static const hq_vec const5 = hq_vec::expand(0x0005);
|
||||||
static const __int64 const5 = 0x0005000500050005;
|
static const hq_vec const6 = hq_vec::expand(0x0006);
|
||||||
static const __int64 const6 = 0x0006000600060006;
|
static const hq_vec const14 = hq_vec::expand(0x000E);
|
||||||
static const __int64 const14 = 0x000E000E000E000E;
|
|
||||||
static const __int64 treshold = 0x0000000000300706;
|
|
||||||
|
|
||||||
inline void Interp1(unsigned char * pc, int c1, int c2)
|
inline void Interp1(unsigned char * pc, int c1, int c2)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1*3+c2)/4;
|
//*((int*)pc) = (c1*3+c2)/4;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result *= const3;
|
||||||
movd mm2, c2
|
result += hq_vec::load(c2);
|
||||||
punpcklbw mm1, reg_blank
|
result >> 2;
|
||||||
punpcklbw mm2, reg_blank
|
|
||||||
pmullw mm1, const3
|
result.store(pc);
|
||||||
paddw mm1, mm2
|
|
||||||
psrlw mm1, 2
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Interp2(unsigned char * pc, int c1, int c2, int c3)
|
inline void Interp2(unsigned char * pc, int c1, int c2, int c3)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1*2+c2+c3) >> 2;
|
//*((int*)pc) = (c1*2+c2+c3) >> 2;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result << 1;
|
||||||
movd mm2, c2
|
result += hq_vec::load(c2);
|
||||||
movd mm3, c3
|
result += hq_vec::load(c3);
|
||||||
punpcklbw mm1, reg_blank
|
result >> 2;
|
||||||
punpcklbw mm2, reg_blank
|
|
||||||
punpcklbw mm3, reg_blank
|
result.store(pc);
|
||||||
psllw mm1, 1
|
|
||||||
paddw mm1, mm2
|
|
||||||
paddw mm1, mm3
|
|
||||||
psrlw mm1, 2
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Interp5(unsigned char * pc, int c1, int c2)
|
inline void Interp5(unsigned char * pc, int c1, int c2)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1+c2)/2;
|
//*((int*)pc) = (c1+c2)/2;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result += hq_vec::load(c2);
|
||||||
movd mm2, c2
|
result >> 1;
|
||||||
punpcklbw mm1, reg_blank
|
|
||||||
punpcklbw mm2, reg_blank
|
result.store(pc);
|
||||||
paddw mm1, mm2
|
|
||||||
psrlw mm1, 1
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Interp6(unsigned char * pc, int c1, int c2, int c3)
|
inline void Interp6(unsigned char * pc, int c1, int c2, int c3)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1*5+c2*2+c3)/8;
|
//*((int*)pc) = (c1*5+c2*2+c3)/8;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result *= const5;
|
||||||
movd mm2, c2
|
result += hq_vec::load(c2) << 1;
|
||||||
movd mm3, c3
|
result += hq_vec::load(c3);
|
||||||
punpcklbw mm1, reg_blank
|
result >> 3;
|
||||||
punpcklbw mm2, reg_blank
|
|
||||||
punpcklbw mm3, reg_blank
|
result.store(pc);
|
||||||
pmullw mm1, const5
|
|
||||||
psllw mm2, 1
|
|
||||||
paddw mm1, mm3
|
|
||||||
paddw mm1, mm2
|
|
||||||
psrlw mm1, 3
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Interp7(unsigned char * pc, int c1, int c2, int c3)
|
inline void Interp7(unsigned char * pc, int c1, int c2, int c3)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1*6+c2+c3)/8;
|
//*((int*)pc) = (c1*6+c2+c3)/8;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result *= const6;
|
||||||
movd mm2, c2
|
result += hq_vec::load(c2);
|
||||||
movd mm3, c3
|
result += hq_vec::load(c3);
|
||||||
punpcklbw mm1, reg_blank
|
result >> 3;
|
||||||
punpcklbw mm2, reg_blank
|
|
||||||
punpcklbw mm3, reg_blank
|
result.store(pc);
|
||||||
pmullw mm1, const6
|
|
||||||
paddw mm2, mm3
|
|
||||||
paddw mm1, mm2
|
|
||||||
psrlw mm1, 3
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Interp9(unsigned char * pc, int c1, int c2, int c3)
|
inline void Interp9(unsigned char * pc, int c1, int c2, int c3)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1*2+(c2+c3)*3)/8;
|
//*((int*)pc) = (c1*2+(c2+c3)*3)/8;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c2);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result += hq_vec::load(c3);
|
||||||
movd mm2, c2
|
result *= const3;
|
||||||
movd mm3, c3
|
result += hq_vec::load(c1) << 1;
|
||||||
punpcklbw mm1, reg_blank
|
result >> 3;
|
||||||
punpcklbw mm2, reg_blank
|
|
||||||
punpcklbw mm3, reg_blank
|
result.store(pc);
|
||||||
psllw mm1, 1
|
|
||||||
paddw mm2, mm3
|
|
||||||
pmullw mm2, const3
|
|
||||||
paddw mm1, mm2
|
|
||||||
psrlw mm1, 3
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Interp10(unsigned char * pc, int c1, int c2, int c3)
|
inline void Interp10(unsigned char * pc, int c1, int c2, int c3)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1*14+c2+c3)/16;
|
//*((int*)pc) = (c1*14+c2+c3)/16;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result *= const14;
|
||||||
movd mm2, c2
|
result += hq_vec::load(c2);
|
||||||
movd mm3, c3
|
result += hq_vec::load(c3);
|
||||||
punpcklbw mm1, reg_blank
|
result >> 4;
|
||||||
punpcklbw mm2, reg_blank
|
|
||||||
punpcklbw mm3, reg_blank
|
result.store(pc);
|
||||||
pmullw mm1, const14
|
|
||||||
paddw mm2, mm3
|
|
||||||
paddw mm1, mm2
|
|
||||||
psrlw mm1, 4
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define PIXEL00_0 *((int*)(pOut)) = c[5];
|
#define PIXEL00_0 *((int*)(pOut)) = c[5];
|
||||||
|
@ -223,7 +175,7 @@ inline void Interp10(unsigned char * pc, int c1, int c2, int c3)
|
||||||
#define PIXEL11_100 Interp10(pOut+BpL+4, c[5], c[6], c[8]);
|
#define PIXEL11_100 Interp10(pOut+BpL+4, c[5], c[6], c[8]);
|
||||||
|
|
||||||
|
|
||||||
int Diff(unsigned int w5, unsigned int w1);
|
bool Diff(const unsigned int, const unsigned int);
|
||||||
|
|
||||||
void DLL hq2x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL )
|
void DLL hq2x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL )
|
||||||
{
|
{
|
||||||
|
@ -2985,7 +2937,7 @@ void DLL hq2x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL )
|
||||||
}
|
}
|
||||||
pOut+=BpL;
|
pOut+=BpL;
|
||||||
}
|
}
|
||||||
__asm emms
|
hq_vec::reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
|
@ -1,6 +1,7 @@
|
||||||
//hq3x filter demo program
|
//hq3x filter demo program
|
||||||
//----------------------------------------------------------
|
//----------------------------------------------------------
|
||||||
//Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
|
//Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
|
||||||
|
//Copyright (C) 2012-2014 Alexey Lysiuk
|
||||||
//
|
//
|
||||||
//This program is free software; you can redistribute it and/or
|
//This program is free software; you can redistribute it and/or
|
||||||
//modify it under the terms of the GNU Lesser General Public
|
//modify it under the terms of the GNU Lesser General Public
|
||||||
|
@ -24,105 +25,73 @@ namespace HQnX_asm
|
||||||
extern int LUT16to32[65536*2];
|
extern int LUT16to32[65536*2];
|
||||||
extern int RGBtoYUV[65536*2];
|
extern int RGBtoYUV[65536*2];
|
||||||
|
|
||||||
static const __int64 reg_blank = 0;
|
static const hq_vec const3 = hq_vec::expand(0x0003);
|
||||||
static const __int64 const3 = 0x0003000300030003;
|
static const hq_vec const7 = hq_vec::expand(0x0007);
|
||||||
static const __int64 const7 = 0x0007000700070007;
|
|
||||||
static const __int64 treshold = 0x0000000000300706;
|
|
||||||
|
|
||||||
inline void Interp1(unsigned char * pc, int c1, int c2)
|
inline void Interp1(unsigned char * pc, int c1, int c2)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1*3+c2)/4;
|
//*((int*)pc) = (c1*3+c2)/4;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result *= const3;
|
||||||
movd mm2, c2
|
result += hq_vec::load(c2);
|
||||||
punpcklbw mm1, reg_blank
|
result >> 2;
|
||||||
punpcklbw mm2, reg_blank
|
|
||||||
pmullw mm1, const3
|
result.store(pc);
|
||||||
paddw mm1, mm2
|
|
||||||
psrlw mm1, 2
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Interp2(unsigned char * pc, int c1, int c2, int c3)
|
inline void Interp2(unsigned char * pc, int c1, int c2, int c3)
|
||||||
{
|
{
|
||||||
// *((int*)pc) = (c1*2+c2+c3)/4;
|
// *((int*)pc) = (c1*2+c2+c3)/4;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result << 1;
|
||||||
movd mm2, c2
|
result += hq_vec::load(c2);
|
||||||
movd mm3, c3
|
result += hq_vec::load(c3);
|
||||||
punpcklbw mm1, reg_blank
|
result >> 2;
|
||||||
punpcklbw mm2, reg_blank
|
|
||||||
punpcklbw mm3, reg_blank
|
result.store(pc);
|
||||||
psllw mm1, 1
|
|
||||||
paddw mm1, mm2
|
|
||||||
paddw mm1, mm3
|
|
||||||
psrlw mm1, 2
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Interp3(unsigned char * pc, int c1, int c2)
|
inline void Interp3(unsigned char * pc, int c1, int c2)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1*7+c2)/8;
|
//*((int*)pc) = (c1*7+c2)/8;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result *= const7;
|
||||||
movd mm2, c2
|
result += hq_vec::load(c2);
|
||||||
punpcklbw mm1, reg_blank
|
result >> 3;
|
||||||
punpcklbw mm2, reg_blank
|
|
||||||
pmullw mm1, const7
|
result.store(pc);
|
||||||
paddw mm1, mm2
|
|
||||||
psrlw mm1, 3
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Interp4(unsigned char * pc, int c1, int c2, int c3)
|
inline void Interp4(unsigned char * pc, int c1, int c2, int c3)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1*2+(c2+c3)*7)/16;
|
//*((int*)pc) = (c1*2+(c2+c3)*7)/16;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c2);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result += hq_vec::load(c3);
|
||||||
movd mm2, c2
|
result *= const7;
|
||||||
movd mm3, c3
|
result += hq_vec::load(c1) << 1;
|
||||||
punpcklbw mm1, reg_blank
|
result >> 4;
|
||||||
punpcklbw mm2, reg_blank
|
|
||||||
punpcklbw mm3, reg_blank
|
result.store(pc);
|
||||||
psllw mm1, 1
|
|
||||||
paddw mm2, mm3
|
|
||||||
pmullw mm2, const7
|
|
||||||
paddw mm1, mm2
|
|
||||||
psrlw mm1, 4
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Interp5(unsigned char * pc, int c1, int c2)
|
inline void Interp5(unsigned char * pc, int c1, int c2)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1+c2)/2;
|
//*((int*)pc) = (c1+c2)/2;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result += hq_vec::load(c2);
|
||||||
movd mm2, c2
|
result >> 1;
|
||||||
punpcklbw mm1, reg_blank
|
|
||||||
punpcklbw mm2, reg_blank
|
result.store(pc);
|
||||||
paddw mm1, mm2
|
|
||||||
psrlw mm1, 1
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define PIXEL00_1M Interp1(pOut, c[5], c[1]);
|
#define PIXEL00_1M Interp1(pOut, c[5], c[1]);
|
||||||
|
@ -179,7 +148,7 @@ inline void Interp5(unsigned char * pc, int c1, int c2)
|
||||||
#define PIXEL22_5 Interp5(pOut+BpL+BpL+8, c[6], c[8]);
|
#define PIXEL22_5 Interp5(pOut+BpL+BpL+8, c[6], c[8]);
|
||||||
#define PIXEL22_C *((int*)(pOut+BpL+BpL+8)) = c[5];
|
#define PIXEL22_C *((int*)(pOut+BpL+BpL+8)) = c[5];
|
||||||
|
|
||||||
int Diff(unsigned int w5, unsigned int w1);
|
bool Diff(const unsigned int, const unsigned int);
|
||||||
|
|
||||||
void DLL hq3x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL )
|
void DLL hq3x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL )
|
||||||
{
|
{
|
||||||
|
@ -3867,7 +3836,7 @@ void DLL hq3x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL )
|
||||||
pOut+=BpL;
|
pOut+=BpL;
|
||||||
pOut+=BpL;
|
pOut+=BpL;
|
||||||
}
|
}
|
||||||
__asm emms
|
hq_vec::reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
//hq4x filter demo program
|
//hq4x filter demo program
|
||||||
//----------------------------------------------------------
|
//----------------------------------------------------------
|
||||||
//Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
|
//Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
|
||||||
|
//Copyright (C) 2012-2014 Alexey Lysiuk
|
||||||
//
|
//
|
||||||
//This program is free software; you can redistribute it and/or
|
//This program is free software; you can redistribute it and/or
|
||||||
//modify it under the terms of the GNU Lesser General Public
|
//modify it under the terms of the GNU Lesser General Public
|
||||||
|
@ -19,7 +20,6 @@
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <conio.h>
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "hqnx_asm.h"
|
#include "hqnx_asm.h"
|
||||||
|
|
||||||
|
@ -29,148 +29,103 @@ namespace HQnX_asm
|
||||||
int LUT16to32[65536*2];
|
int LUT16to32[65536*2];
|
||||||
int RGBtoYUV[65536*2];
|
int RGBtoYUV[65536*2];
|
||||||
|
|
||||||
static const __int64 reg_blank = 0;
|
static const hq_vec const3 = hq_vec::expand(0x0003);
|
||||||
static const __int64 const3 = 0x0003000300030003;
|
static const hq_vec const5 = hq_vec::expand(0x0005);
|
||||||
static const __int64 const5 = 0x0005000500050005;
|
static const hq_vec const6 = hq_vec::expand(0x0006);
|
||||||
static const __int64 const6 = 0x0006000600060006;
|
static const hq_vec const7 = hq_vec::expand(0x0007);
|
||||||
static const __int64 const7 = 0x0007000700070007;
|
|
||||||
static const __int64 treshold = 0x0000000000300706;
|
|
||||||
|
|
||||||
|
|
||||||
inline void Interp1(unsigned char * pc, int c1, int c2)
|
inline void Interp1(unsigned char * pc, int c1, int c2)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1*3+c2)/4;
|
//*((int*)pc) = (c1*3+c2)/4;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result *= const3;
|
||||||
movd mm2, c2
|
result += hq_vec::load(c2);
|
||||||
punpcklbw mm1, reg_blank
|
result >> 2;
|
||||||
punpcklbw mm2, reg_blank
|
|
||||||
pmullw mm1, const3
|
result.store(pc);
|
||||||
paddw mm1, mm2
|
|
||||||
psrlw mm1, 2
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Interp2(unsigned char * pc, int c1, int c2, int c3)
|
inline void Interp2(unsigned char * pc, int c1, int c2, int c3)
|
||||||
{
|
{
|
||||||
// *((int*)pc) = (c1*2+c2+c3)/4;
|
// *((int*)pc) = (c1*2+c2+c3)/4;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result << 1;
|
||||||
movd mm2, c2
|
result += hq_vec::load(c2);
|
||||||
movd mm3, c3
|
result += hq_vec::load(c3);
|
||||||
punpcklbw mm1, reg_blank
|
result >> 2;
|
||||||
punpcklbw mm2, reg_blank
|
|
||||||
punpcklbw mm3, reg_blank
|
result.store(pc);
|
||||||
psllw mm1, 1
|
|
||||||
paddw mm1, mm2
|
|
||||||
paddw mm1, mm3
|
|
||||||
psrlw mm1, 2
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Interp3(unsigned char * pc, int c1, int c2)
|
inline void Interp3(unsigned char * pc, int c1, int c2)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1*7+c2)/8;
|
//*((int*)pc) = (c1*7+c2)/8;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result *= const7;
|
||||||
movd mm2, c2
|
result += hq_vec::load(c2);
|
||||||
punpcklbw mm1, reg_blank
|
result >> 3;
|
||||||
punpcklbw mm2, reg_blank
|
|
||||||
pmullw mm1, const7
|
result.store(pc);
|
||||||
paddw mm1, mm2
|
|
||||||
psrlw mm1, 3
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Interp5(unsigned char * pc, int c1, int c2)
|
inline void Interp5(unsigned char * pc, int c1, int c2)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1+c2)/2;
|
//*((int*)pc) = (c1+c2)/2;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result += hq_vec::load(c2);
|
||||||
movd mm2, c2
|
result >> 1;
|
||||||
punpcklbw mm1, reg_blank
|
|
||||||
punpcklbw mm2, reg_blank
|
result.store(pc);
|
||||||
paddw mm1, mm2
|
|
||||||
psrlw mm1, 1
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Interp6(unsigned char * pc, int c1, int c2, int c3)
|
inline void Interp6(unsigned char * pc, int c1, int c2, int c3)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1*5+c2*2+c3)/8;
|
//*((int*)pc) = (c1*5+c2*2+c3)/8;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result *= const5;
|
||||||
movd mm2, c2
|
result += hq_vec::load(c2) << 1;
|
||||||
movd mm3, c3
|
result += hq_vec::load(c3);
|
||||||
punpcklbw mm1, reg_blank
|
result >> 3;
|
||||||
punpcklbw mm2, reg_blank
|
|
||||||
punpcklbw mm3, reg_blank
|
result.store(pc);
|
||||||
pmullw mm1, const5
|
|
||||||
psllw mm2, 1
|
|
||||||
paddw mm1, mm3
|
|
||||||
paddw mm1, mm2
|
|
||||||
psrlw mm1, 3
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Interp7(unsigned char * pc, int c1, int c2, int c3)
|
inline void Interp7(unsigned char * pc, int c1, int c2, int c3)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1*6+c2+c3)/8;
|
//*((int*)pc) = (c1*6+c2+c3)/8;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result *= const6;
|
||||||
movd mm2, c2
|
result += hq_vec::load(c2);
|
||||||
movd mm3, c3
|
result += hq_vec::load(c3);
|
||||||
punpcklbw mm1, reg_blank
|
result >> 3;
|
||||||
punpcklbw mm2, reg_blank
|
|
||||||
punpcklbw mm3, reg_blank
|
result.store(pc);
|
||||||
pmullw mm1, const6
|
|
||||||
paddw mm2, mm3
|
|
||||||
paddw mm1, mm2
|
|
||||||
psrlw mm1, 3
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Interp8(unsigned char * pc, int c1, int c2)
|
inline void Interp8(unsigned char * pc, int c1, int c2)
|
||||||
{
|
{
|
||||||
//*((int*)pc) = (c1*5+c2*3)/8;
|
//*((int*)pc) = (c1*5+c2*3)/8;
|
||||||
__asm
|
|
||||||
{
|
hq_vec result = hq_vec::load(c1);
|
||||||
mov eax, pc
|
|
||||||
movd mm1, c1
|
result *= const5;
|
||||||
movd mm2, c2
|
result += hq_vec::load(c2) * const3;
|
||||||
punpcklbw mm1, reg_blank
|
result >> 3;
|
||||||
punpcklbw mm2, reg_blank
|
|
||||||
pmullw mm1, const5
|
result.store(pc);
|
||||||
pmullw mm2, const3
|
|
||||||
paddw mm1, mm2
|
|
||||||
psrlw mm1, 3
|
|
||||||
packuswb mm1, reg_blank
|
|
||||||
movd [eax], mm1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define PIXEL00_0 *((int*)(pOut)) = c[5];
|
#define PIXEL00_0 *((int*)(pOut)) = c[5];
|
||||||
|
@ -314,33 +269,26 @@ inline void Interp8(unsigned char * pc, int c1, int c2)
|
||||||
#define PIXEL33_81 Interp8(pOut+BpL+BpL+BpL+12, c[5], c[6]);
|
#define PIXEL33_81 Interp8(pOut+BpL+BpL+BpL+12, c[5], c[6]);
|
||||||
#define PIXEL33_82 Interp8(pOut+BpL+BpL+BpL+12, c[5], c[8]);
|
#define PIXEL33_82 Interp8(pOut+BpL+BpL+BpL+12, c[5], c[8]);
|
||||||
|
|
||||||
|
bool Diff(const unsigned int rgb1, const unsigned int rgb2)
|
||||||
#pragma warning(disable: 4035)
|
|
||||||
|
|
||||||
int Diff(unsigned int w5, unsigned int w1)
|
|
||||||
{
|
{
|
||||||
__asm
|
if (rgb1 == rgb2)
|
||||||
{
|
{
|
||||||
xor eax,eax
|
return false;
|
||||||
mov ebx,w5
|
|
||||||
mov edx,w1
|
|
||||||
cmp ebx,edx
|
|
||||||
je FIN
|
|
||||||
mov ecx,offset RGBtoYUV
|
|
||||||
movd mm1,[ecx + ebx*4]
|
|
||||||
movq mm5,mm1
|
|
||||||
movd mm2,[ecx + edx*4]
|
|
||||||
psubusb mm1,mm2
|
|
||||||
psubusb mm2,mm5
|
|
||||||
por mm1,mm2
|
|
||||||
psubusb mm1,treshold
|
|
||||||
movd eax,mm1
|
|
||||||
FIN:
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
// returns result in eax register
|
|
||||||
|
|
||||||
#pragma warning(default: 4035)
|
static const hq_vec THRESHOLD = 0x00300706;
|
||||||
|
|
||||||
|
const hq_vec yuv1 = RGBtoYUV[rgb1];
|
||||||
|
const hq_vec yuv2 = RGBtoYUV[rgb2];
|
||||||
|
|
||||||
|
const hq_vec delta1 = yuv1 - yuv2;
|
||||||
|
const hq_vec delta2 = yuv2 - yuv1;
|
||||||
|
|
||||||
|
const hq_vec delta = delta1 | delta2;
|
||||||
|
const hq_vec result = delta - THRESHOLD;
|
||||||
|
|
||||||
|
return 0 != result;
|
||||||
|
}
|
||||||
|
|
||||||
void DLL hq4x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL )
|
void DLL hq4x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL )
|
||||||
{
|
{
|
||||||
|
@ -5412,7 +5360,7 @@ void DLL hq4x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL )
|
||||||
pOut += BpL;
|
pOut += BpL;
|
||||||
pOut += BpL;
|
pOut += BpL;
|
||||||
}
|
}
|
||||||
__asm emms
|
hq_vec::reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
void DLL InitLUTs()
|
void DLL InitLUTs()
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
//----------------------------------------------------------
|
//----------------------------------------------------------
|
||||||
//Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
|
//Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
|
||||||
//Copyright (C) 2009 Benjamin Berkels
|
//Copyright (C) 2009 Benjamin Berkels
|
||||||
|
//Copyright (C) 2012-2014 Alexey Lysiuk
|
||||||
//
|
//
|
||||||
//This program is free software; you can redistribute it and/or
|
//This program is free software; you can redistribute it and/or
|
||||||
//modify it under the terms of the GNU Lesser General Public
|
//modify it under the terms of the GNU Lesser General Public
|
||||||
|
@ -24,6 +25,198 @@
|
||||||
|
|
||||||
#include "hqnx_asm_Image.h"
|
#include "hqnx_asm_Image.h"
|
||||||
|
|
||||||
|
// IMPORTANT NOTE!
|
||||||
|
// The following is not a generic vectorized math class
|
||||||
|
// Each member function or overloaded operator does specific task to simplify client code
|
||||||
|
// To re-implement this class for different platform you need check very carefully
|
||||||
|
// the Intel C++ Intrinsic Reference at http://software.intel.com/file/18072/
|
||||||
|
|
||||||
|
#if defined _MSC_VER && defined _M_X64
|
||||||
|
|
||||||
|
// Implementation via SSE2 intrinsics
|
||||||
|
// MSVC doesn't support MMX intrinsics on x64
|
||||||
|
|
||||||
|
#include <emmintrin.h>
|
||||||
|
|
||||||
|
class hq_vec
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
hq_vec(const int value)
|
||||||
|
: m_value(_mm_cvtsi32_si128(value))
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
static hq_vec load(const int source)
|
||||||
|
{
|
||||||
|
return _mm_unpacklo_epi8(_mm_cvtsi32_si128(source), _mm_cvtsi32_si128(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
static hq_vec expand(const short source)
|
||||||
|
{
|
||||||
|
return _mm_set_epi16(source, source, source, source, source, source, source, source);
|
||||||
|
}
|
||||||
|
|
||||||
|
void store(unsigned char* const destination) const
|
||||||
|
{
|
||||||
|
*reinterpret_cast<int*>(destination) = _mm_cvtsi128_si32(_mm_packus_epi16(m_value, _mm_cvtsi32_si128(0)));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void reset()
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
hq_vec& operator+=(const hq_vec& right)
|
||||||
|
{
|
||||||
|
m_value = _mm_add_epi16(m_value, right.m_value);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
hq_vec& operator*=(const hq_vec& right)
|
||||||
|
{
|
||||||
|
m_value = _mm_mullo_epi16(m_value, right.m_value);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
hq_vec& operator<<(const int count)
|
||||||
|
{
|
||||||
|
m_value = _mm_sll_epi16(m_value, _mm_cvtsi32_si128(count));
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
hq_vec& operator>>(const int count)
|
||||||
|
{
|
||||||
|
m_value = _mm_srl_epi16(m_value, _mm_cvtsi32_si128(count));
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
__m128i m_value;
|
||||||
|
|
||||||
|
hq_vec(const __m128i value)
|
||||||
|
: m_value(value)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
friend hq_vec operator- (const hq_vec&, const hq_vec&);
|
||||||
|
friend hq_vec operator* (const hq_vec&, const hq_vec&);
|
||||||
|
friend hq_vec operator| (const hq_vec&, const hq_vec&);
|
||||||
|
friend bool operator!=(const int, const hq_vec&);
|
||||||
|
};
|
||||||
|
|
||||||
|
inline hq_vec operator-(const hq_vec& left, const hq_vec& right)
|
||||||
|
{
|
||||||
|
return _mm_subs_epu8(left.m_value, right.m_value);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline hq_vec operator*(const hq_vec& left, const hq_vec& right)
|
||||||
|
{
|
||||||
|
return _mm_mullo_epi16(left.m_value, right.m_value);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline hq_vec operator|(const hq_vec& left, const hq_vec& right)
|
||||||
|
{
|
||||||
|
return _mm_or_si128(left.m_value, right.m_value);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool operator!=(const int left, const hq_vec& right)
|
||||||
|
{
|
||||||
|
return left != _mm_cvtsi128_si32(right.m_value);
|
||||||
|
}
|
||||||
|
|
||||||
|
#else // _M_X64
|
||||||
|
|
||||||
|
// Implementation via MMX intrinsics
|
||||||
|
|
||||||
|
#include <mmintrin.h>
|
||||||
|
|
||||||
|
class hq_vec
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
hq_vec(const int value)
|
||||||
|
: m_value(_mm_cvtsi32_si64(value))
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
static hq_vec load(const int source)
|
||||||
|
{
|
||||||
|
return _mm_unpacklo_pi8(_mm_cvtsi32_si64(source), _mm_cvtsi32_si64(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
static hq_vec expand(const short source)
|
||||||
|
{
|
||||||
|
return _mm_set_pi16(source, source, source, source);
|
||||||
|
}
|
||||||
|
|
||||||
|
void store(unsigned char* const destination) const
|
||||||
|
{
|
||||||
|
*reinterpret_cast<int*>(destination) = _mm_cvtsi64_si32(_mm_packs_pu16(m_value, _mm_cvtsi32_si64(0)));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void reset()
|
||||||
|
{
|
||||||
|
_mm_empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
hq_vec& operator+=(const hq_vec& right)
|
||||||
|
{
|
||||||
|
m_value = _mm_add_pi16(m_value, right.m_value);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
hq_vec& operator*=(const hq_vec& right)
|
||||||
|
{
|
||||||
|
m_value = _mm_mullo_pi16(m_value, right.m_value);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
hq_vec& operator<<(const int count)
|
||||||
|
{
|
||||||
|
m_value = _mm_sll_pi16(m_value, _mm_cvtsi32_si64(count));
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
hq_vec& operator>>(const int count)
|
||||||
|
{
|
||||||
|
m_value = _mm_srl_pi16(m_value, _mm_cvtsi32_si64(count));
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
__m64 m_value;
|
||||||
|
|
||||||
|
hq_vec(const __m64 value)
|
||||||
|
: m_value(value)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
friend hq_vec operator- (const hq_vec&, const hq_vec&);
|
||||||
|
friend hq_vec operator* (const hq_vec&, const hq_vec&);
|
||||||
|
friend hq_vec operator| (const hq_vec&, const hq_vec&);
|
||||||
|
friend bool operator!=(const int, const hq_vec&);
|
||||||
|
};
|
||||||
|
|
||||||
|
inline hq_vec operator-(const hq_vec& left, const hq_vec& right)
|
||||||
|
{
|
||||||
|
return _mm_subs_pu8(left.m_value, right.m_value);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline hq_vec operator*(const hq_vec& left, const hq_vec& right)
|
||||||
|
{
|
||||||
|
return _mm_mullo_pi16(left.m_value, right.m_value);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline hq_vec operator|(const hq_vec& left, const hq_vec& right)
|
||||||
|
{
|
||||||
|
return _mm_or_si64(left.m_value, right.m_value);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool operator!=(const int left, const hq_vec& right)
|
||||||
|
{
|
||||||
|
return left != _mm_cvtsi64_si32(right.m_value);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // _MSC_VER && _M_X64
|
||||||
|
|
||||||
namespace HQnX_asm
|
namespace HQnX_asm
|
||||||
{
|
{
|
||||||
void DLL hq2x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL );
|
void DLL hq2x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL );
|
||||||
|
|
|
@ -20,6 +20,10 @@
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "hqnx_asm_Image.h"
|
#include "hqnx_asm_Image.h"
|
||||||
|
|
||||||
|
#ifndef _MSC_VER
|
||||||
|
#define _stricmp strcasecmp
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace HQnX_asm
|
namespace HQnX_asm
|
||||||
{
|
{
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue