diff --git a/src/gl/hqnx_asm/hq2x_asm.cpp b/src/gl/hqnx_asm/hq2x_asm.cpp
index 59d52c180..65e460377 100644
--- a/src/gl/hqnx_asm/hq2x_asm.cpp
+++ b/src/gl/hqnx_asm/hq2x_asm.cpp
@@ -1,6 +1,7 @@
 //hq2x filter demo program
 //----------------------------------------------------------
 //Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
+//Copyright (C) 2012-2014 Alexey Lysiuk
 //
 //This program is free software; you can redistribute it and/or
 //modify it under the terms of the GNU Lesser General Public
@@ -24,153 +25,104 @@ namespace HQnX_asm
 extern int LUT16to32[65536*2];
 extern int RGBtoYUV[65536*2];
 
-static const __int64 reg_blank = 0;
-static const __int64 const3    = 0x0003000300030003;
-static const __int64 const5    = 0x0005000500050005;
-static const __int64 const6    = 0x0006000600060006;
-static const __int64 const14   = 0x000E000E000E000E;
-static const __int64 treshold  = 0x0000000000300706;
+static const hq_vec const3  = hq_vec::expand(3);   // multipliers, one copy per 16-bit lane
+static const hq_vec const5  = hq_vec::expand(5);
+static const hq_vec const6  = hq_vec::expand(6);
+static const hq_vec const14 = hq_vec::expand(14);
 
 inline void Interp1(unsigned char * pc, int c1, int c2)
 {
   //*((int*)pc) = (c1*3+c2)/4;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    pmullw     mm1, const3
-    paddw      mm1, mm2
-    psrlw      mm1, 2
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel version of the formula above; load() widens each byte to a 16-bit lane.
+  const hq_vec minor = hq_vec::load(c2);
+  hq_vec blend = hq_vec::load(c1);
+
+  blend *= const3;
+  blend += minor;
+  blend >> 2;  // hq_vec::operator>> shifts its left operand in place
+  blend.store(pc);
 }
 
 inline void Interp2(unsigned char * pc, int c1, int c2, int c3)
 {
   //*((int*)pc) = (c1*2+c2+c3) >> 2;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    movd       mm3, c3
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    punpcklbw  mm3, reg_blank
-    psllw      mm1, 1
-    paddw      mm1, mm2
-    paddw      mm1, mm3
-    psrlw      mm1, 2
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel version of the formula above; load() widens each byte to a 16-bit lane.
+  hq_vec blend = hq_vec::load(c1);
+  blend << 1;  // c1*2 -- hq_vec's shift operators modify their left operand
+
+  blend += hq_vec::load(c2);
+  blend += hq_vec::load(c3);
+  blend >> 2;  // divide the weighted sum by 4
+
+  blend.store(pc);
 }
 
 inline void Interp5(unsigned char * pc, int c1, int c2)
 {
   //*((int*)pc) = (c1+c2)/2;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    paddw      mm1, mm2
-    psrlw      mm1, 1
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel average; load() widens each byte to a 16-bit lane before the add.
+  const hq_vec other = hq_vec::load(c2);
+  hq_vec blend = hq_vec::load(c1);
+
+  blend += other;
+  blend >> 1;  // hq_vec::operator>> shifts its left operand in place
+  blend.store(pc);
 }
 
 inline void Interp6(unsigned char * pc, int c1, int c2, int c3)
 {
   //*((int*)pc) = (c1*5+c2*2+c3)/8;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    movd       mm3, c3
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    punpcklbw  mm3, reg_blank
-    pmullw     mm1, const5
-    psllw      mm2, 1
-    paddw      mm1, mm3
-    paddw      mm1, mm2
-    psrlw      mm1, 3
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel version of the formula above; load() widens each byte to a 16-bit lane.
+  hq_vec blend = hq_vec::load(c1);
+  hq_vec doubled = hq_vec::load(c2);
+
+  blend *= const5;
+  blend += (doubled << 1);  // c2*2 -- the shift modifies doubled in place
+  blend += hq_vec::load(c3);
+  blend >> 3;  // divide the weighted sum by 8
+  blend.store(pc);
 }
 
 inline void Interp7(unsigned char * pc, int c1, int c2, int c3)
 {
   //*((int*)pc) = (c1*6+c2+c3)/8;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    movd       mm3, c3
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    punpcklbw  mm3, reg_blank
-    pmullw     mm1, const6
-    paddw      mm2, mm3
-    paddw      mm1, mm2
-    psrlw      mm1, 3
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel version of the formula above; load() widens each byte to a 16-bit lane.
+  hq_vec blend = hq_vec::load(c1);
+  blend *= const6;
+
+  blend += hq_vec::load(c2);
+  blend += hq_vec::load(c3);
+  blend >> 3;  // divide by 8 -- hq_vec::operator>> shifts in place
+
+  blend.store(pc);
 }
 
 inline void Interp9(unsigned char * pc, int c1, int c2, int c3)
 {
   //*((int*)pc) = (c1*2+(c2+c3)*3)/8;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    movd       mm3, c3
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    punpcklbw  mm3, reg_blank
-    psllw      mm1, 1
-    paddw      mm2, mm3
-    pmullw     mm2, const3
-    paddw      mm1, mm2
-    psrlw      mm1, 3
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel version of the formula above; load() widens each byte to a 16-bit lane.
+  hq_vec blend = hq_vec::load(c2);
+  hq_vec doubled = hq_vec::load(c1);
+
+  blend += hq_vec::load(c3);
+  blend *= const3;  // (c2 + c3) * 3
+  blend += (doubled << 1);  // + c1*2 -- the shift modifies doubled in place
+  blend >> 3;  // divide the weighted sum by 8
+  blend.store(pc);
 }
 
 inline void Interp10(unsigned char * pc, int c1, int c2, int c3)
 {
   //*((int*)pc) = (c1*14+c2+c3)/16;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    movd       mm3, c3
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    punpcklbw  mm3, reg_blank
-    pmullw     mm1, const14
-    paddw      mm2, mm3
-    paddw      mm1, mm2
-    psrlw      mm1, 4
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel version of the formula above; load() widens each byte to a 16-bit lane.
+  hq_vec blend = hq_vec::load(c1);
+  blend *= const14;
+
+  blend += hq_vec::load(c2);
+  blend += hq_vec::load(c3);
+  blend >> 4;  // divide by 16 -- hq_vec::operator>> shifts in place
+
+  blend.store(pc);
 }
 
 #define PIXEL00_0     *((int*)(pOut)) = c[5];
@@ -223,7 +175,7 @@ inline void Interp10(unsigned char * pc, int c1, int c2, int c3)
 #define PIXEL11_100   Interp10(pOut+BpL+4, c[5], c[6], c[8]);
 
 
-int Diff(unsigned int w5, unsigned int w1);
+bool Diff(const unsigned int rgb1, const unsigned int rgb2); // defined in hq4x_asm.cpp
 
 void DLL hq2x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL )
 {
@@ -2985,7 +2937,7 @@ void DLL hq2x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL )
     }
     pOut+=BpL;
   }
-  __asm emms
+  hq_vec::reset();
 }
 
 }
\ No newline at end of file
diff --git a/src/gl/hqnx_asm/hq3x_asm.cpp b/src/gl/hqnx_asm/hq3x_asm.cpp
index 8d1f3572a..073ce6656 100644
--- a/src/gl/hqnx_asm/hq3x_asm.cpp
+++ b/src/gl/hqnx_asm/hq3x_asm.cpp
@@ -1,6 +1,7 @@
 //hq3x filter demo program
 //----------------------------------------------------------
 //Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
+//Copyright (C) 2012-2014 Alexey Lysiuk
 //
 //This program is free software; you can redistribute it and/or
 //modify it under the terms of the GNU Lesser General Public
@@ -24,105 +25,73 @@ namespace HQnX_asm
 extern int   LUT16to32[65536*2];
 extern int   RGBtoYUV[65536*2];
 
-static const __int64 reg_blank = 0;
-static const __int64 const3    = 0x0003000300030003;
-static const __int64 const7    = 0x0007000700070007;
-static const __int64 treshold  = 0x0000000000300706;
+static const hq_vec const3 = hq_vec::expand(3);  // multipliers, one copy per 16-bit lane
+static const hq_vec const7 = hq_vec::expand(7);
 
 inline void Interp1(unsigned char * pc, int c1, int c2)
 {
   //*((int*)pc) = (c1*3+c2)/4;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    pmullw     mm1, const3
-    paddw      mm1, mm2
-    psrlw      mm1, 2
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel version of the formula above; load() widens each byte to a 16-bit lane.
+  const hq_vec minor = hq_vec::load(c2);
+  hq_vec blend = hq_vec::load(c1);
+
+  blend *= const3;
+  blend += minor;
+  blend >> 2;  // hq_vec::operator>> shifts its left operand in place
+  blend.store(pc);
 }
 
 inline void Interp2(unsigned char * pc, int c1, int c2, int c3)
 {
 //  *((int*)pc) = (c1*2+c2+c3)/4;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    movd       mm3, c3
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    punpcklbw  mm3, reg_blank
-    psllw      mm1, 1
-    paddw      mm1, mm2
-    paddw      mm1, mm3
-    psrlw      mm1, 2
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel version of the formula above; load() widens each byte to a 16-bit lane.
+  hq_vec blend = hq_vec::load(c1);
+  blend << 1;  // c1*2 -- hq_vec's shift operators modify their left operand
+
+  blend += hq_vec::load(c2);
+  blend += hq_vec::load(c3);
+  blend >> 2;  // divide the weighted sum by 4
+
+  blend.store(pc);
 }
 
 inline void Interp3(unsigned char * pc, int c1, int c2)
 {
   //*((int*)pc) = (c1*7+c2)/8;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    pmullw     mm1, const7
-    paddw      mm1, mm2
-    psrlw      mm1, 3
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel version of the formula above; load() widens each byte to a 16-bit lane.
+  const hq_vec minor = hq_vec::load(c2);
+  hq_vec blend = hq_vec::load(c1);
+
+  blend *= const7;
+  blend += minor;
+  blend >> 3;  // divide by 8 -- hq_vec::operator>> shifts in place
+  blend.store(pc);
 }
 
 inline void Interp4(unsigned char * pc, int c1, int c2, int c3)
 {
   //*((int*)pc) = (c1*2+(c2+c3)*7)/16;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    movd       mm3, c3
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    punpcklbw  mm3, reg_blank
-    psllw      mm1, 1
-    paddw      mm2, mm3
-    pmullw     mm2, const7
-    paddw      mm1, mm2
-    psrlw      mm1, 4
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel version of the formula above; load() widens each byte to a 16-bit lane.
+  hq_vec blend = hq_vec::load(c2);
+  hq_vec doubled = hq_vec::load(c1);
+
+  blend += hq_vec::load(c3);
+  blend *= const7;  // (c2 + c3) * 7
+  blend += (doubled << 1);  // + c1*2 -- the shift modifies doubled in place
+  blend >> 4;  // divide the weighted sum by 16
+  blend.store(pc);
 }
 
 inline void Interp5(unsigned char * pc, int c1, int c2)
 {
   //*((int*)pc) = (c1+c2)/2;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    paddw      mm1, mm2
-    psrlw      mm1, 1
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel average; load() widens each byte to a 16-bit lane before the add.
+  const hq_vec other = hq_vec::load(c2);
+  hq_vec blend = hq_vec::load(c1);
+
+  blend += other;
+  blend >> 1;  // hq_vec::operator>> shifts its left operand in place
+  blend.store(pc);
 }
 
 #define PIXEL00_1M  Interp1(pOut, c[5], c[1]);
@@ -179,7 +148,7 @@ inline void Interp5(unsigned char * pc, int c1, int c2)
 #define PIXEL22_5   Interp5(pOut+BpL+BpL+8, c[6], c[8]);
 #define PIXEL22_C   *((int*)(pOut+BpL+BpL+8)) = c[5];
 
-int Diff(unsigned int w5, unsigned int w1);
+bool Diff(const unsigned int rgb1, const unsigned int rgb2); // defined in hq4x_asm.cpp
 
 void DLL hq3x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL )
 {
@@ -3867,7 +3836,7 @@ void DLL hq3x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL )
     pOut+=BpL;
     pOut+=BpL;
   }
-  __asm emms
+  hq_vec::reset();
 }
 
 
diff --git a/src/gl/hqnx_asm/hq4x_asm.cpp b/src/gl/hqnx_asm/hq4x_asm.cpp
index 57a496c72..34a474405 100644
--- a/src/gl/hqnx_asm/hq4x_asm.cpp
+++ b/src/gl/hqnx_asm/hq4x_asm.cpp
@@ -1,6 +1,7 @@
 //hq4x filter demo program
 //----------------------------------------------------------
 //Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
+//Copyright (C) 2012-2014 Alexey Lysiuk
 //
 //This program is free software; you can redistribute it and/or
 //modify it under the terms of the GNU Lesser General Public
@@ -19,7 +20,6 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include <conio.h>
 #include <string.h>
 #include "hqnx_asm.h"
 
@@ -29,148 +29,103 @@ namespace HQnX_asm
 int   LUT16to32[65536*2];
 int   RGBtoYUV[65536*2];
 
-static const __int64 reg_blank = 0;
-static const __int64 const3    = 0x0003000300030003;
-static const __int64 const5    = 0x0005000500050005;
-static const __int64 const6    = 0x0006000600060006;
-static const __int64 const7    = 0x0007000700070007;
-static const __int64 treshold  = 0x0000000000300706;
+static const hq_vec const3 = hq_vec::expand(3);  // multipliers, one copy per 16-bit lane
+static const hq_vec const5 = hq_vec::expand(5);
+static const hq_vec const6 = hq_vec::expand(6);
+static const hq_vec const7 = hq_vec::expand(7);
 
 
 inline void Interp1(unsigned char * pc, int c1, int c2)
 {
   //*((int*)pc) = (c1*3+c2)/4;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    pmullw     mm1, const3
-    paddw      mm1, mm2
-    psrlw      mm1, 2
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel version of the formula above; load() widens each byte to a 16-bit lane.
+  const hq_vec minor = hq_vec::load(c2);
+  hq_vec blend = hq_vec::load(c1);
+
+  blend *= const3;
+  blend += minor;
+  blend >> 2;  // hq_vec::operator>> shifts its left operand in place
+  blend.store(pc);
 }
 
 inline void Interp2(unsigned char * pc, int c1, int c2, int c3)
 {
 //  *((int*)pc) = (c1*2+c2+c3)/4;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    movd       mm3, c3
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    punpcklbw  mm3, reg_blank
-    psllw      mm1, 1
-    paddw      mm1, mm2
-    paddw      mm1, mm3
-    psrlw      mm1, 2
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel version of the formula above; load() widens each byte to a 16-bit lane.
+  hq_vec blend = hq_vec::load(c1);
+  blend << 1;  // c1*2 -- hq_vec's shift operators modify their left operand
+
+  blend += hq_vec::load(c2);
+  blend += hq_vec::load(c3);
+  blend >> 2;  // divide the weighted sum by 4
+
+  blend.store(pc);
 }
 
 inline void Interp3(unsigned char * pc, int c1, int c2)
 {
   //*((int*)pc) = (c1*7+c2)/8;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    pmullw     mm1, const7
-    paddw      mm1, mm2
-    psrlw      mm1, 3
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel version of the formula above; load() widens each byte to a 16-bit lane.
+  const hq_vec minor = hq_vec::load(c2);
+  hq_vec blend = hq_vec::load(c1);
+
+  blend *= const7;
+  blend += minor;
+  blend >> 3;  // divide by 8 -- hq_vec::operator>> shifts in place
+  blend.store(pc);
 }
 
 inline void Interp5(unsigned char * pc, int c1, int c2)
 {
   //*((int*)pc) = (c1+c2)/2;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    paddw      mm1, mm2
-    psrlw      mm1, 1
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel average; load() widens each byte to a 16-bit lane before the add.
+  const hq_vec other = hq_vec::load(c2);
+  hq_vec blend = hq_vec::load(c1);
+
+  blend += other;
+  blend >> 1;  // hq_vec::operator>> shifts its left operand in place
+  blend.store(pc);
 }
 
 inline void Interp6(unsigned char * pc, int c1, int c2, int c3)
 {
   //*((int*)pc) = (c1*5+c2*2+c3)/8;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    movd       mm3, c3
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    punpcklbw  mm3, reg_blank
-    pmullw     mm1, const5
-    psllw      mm2, 1
-    paddw      mm1, mm3
-    paddw      mm1, mm2
-    psrlw      mm1, 3
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel version of the formula above; load() widens each byte to a 16-bit lane.
+  hq_vec blend = hq_vec::load(c1);
+  hq_vec doubled = hq_vec::load(c2);
+
+  blend *= const5;
+  blend += (doubled << 1);  // c2*2 -- the shift modifies doubled in place
+  blend += hq_vec::load(c3);
+  blend >> 3;  // divide the weighted sum by 8
+  blend.store(pc);
 }
 
 inline void Interp7(unsigned char * pc, int c1, int c2, int c3)
 {
   //*((int*)pc) = (c1*6+c2+c3)/8;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    movd       mm3, c3
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    punpcklbw  mm3, reg_blank
-    pmullw     mm1, const6
-    paddw      mm2, mm3
-    paddw      mm1, mm2
-    psrlw      mm1, 3
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel version of the formula above; load() widens each byte to a 16-bit lane.
+  hq_vec blend = hq_vec::load(c1);
+  blend *= const6;
+
+  blend += hq_vec::load(c2);
+  blend += hq_vec::load(c3);
+  blend >> 3;  // divide by 8 -- hq_vec::operator>> shifts in place
+
+  blend.store(pc);
 }
 
 inline void Interp8(unsigned char * pc, int c1, int c2)
 {
   //*((int*)pc) = (c1*5+c2*3)/8;
-  __asm
-  {
-    mov        eax, pc
-    movd       mm1, c1
-    movd       mm2, c2
-    punpcklbw  mm1, reg_blank
-    punpcklbw  mm2, reg_blank
-    pmullw     mm1, const5
-    pmullw     mm2, const3
-    paddw      mm1, mm2
-    psrlw      mm1, 3
-    packuswb   mm1, reg_blank
-    movd       [eax], mm1
-  }
+  // Per-channel version of the formula above; load() widens each byte to a 16-bit lane.
+  const hq_vec scaled = hq_vec::load(c2) * const3;
+  hq_vec blend = hq_vec::load(c1);
+
+  blend *= const5;
+  blend += scaled;
+  blend >> 3;  // divide by 8 -- hq_vec::operator>> shifts in place
+  blend.store(pc);
 }
 
 #define PIXEL00_0     *((int*)(pOut)) = c[5];
@@ -314,33 +269,26 @@ inline void Interp8(unsigned char * pc, int c1, int c2)
 #define PIXEL33_81    Interp8(pOut+BpL+BpL+BpL+12, c[5], c[6]);
 #define PIXEL33_82    Interp8(pOut+BpL+BpL+BpL+12, c[5], c[8]);
 
-
-#pragma warning(disable: 4035)
-
-int Diff(unsigned int w5, unsigned int w1)
+bool Diff(const unsigned int rgb1, const unsigned int rgb2)
 {
-  __asm
+  if (rgb1 == rgb2)
   {
-    xor     eax,eax
-    mov     ebx,w5
-    mov     edx,w1
-    cmp     ebx,edx
-    je      FIN
-    mov     ecx,offset RGBtoYUV 
-    movd    mm1,[ecx + ebx*4]
-    movq    mm5,mm1
-    movd    mm2,[ecx + edx*4]
-    psubusb mm1,mm2
-    psubusb mm2,mm5
-    por     mm1,mm2
-    psubusb mm1,treshold
-    movd    eax,mm1
-FIN:
+    return false;
   }
-}
-// returns result in eax register
+
+  // One limit per byte of a table entry: 0x30, 0x07, 0x06 (presumably Y, U, V -- confirm in InitLUTs).
+  const hq_vec THRESHOLD = 0x00300706;  // plain local: a single register move, no static-init guard
+  const hq_vec yuv1 = RGBtoYUV[rgb1];
+  const hq_vec yuv2 = RGBtoYUV[rgb2];
 
-#pragma warning(default: 4035)
+  // hq_vec's operator- saturates every byte at zero, so one direction holds the positive
+  // difference and the other holds 0; OR-ing the two gives |yuv1 - yuv2| per byte.
+  const hq_vec absDelta = (yuv1 - yuv2) | (yuv2 - yuv1);
+  // Saturating once more leaves a non-zero byte only where a difference exceeds its limit.
+  const hq_vec excess = absDelta - THRESHOLD;
+
+  return 0 != excess;
+}
 
 void DLL hq4x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL )
 {
@@ -5412,7 +5360,7 @@ void DLL hq4x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL )
     pOut += BpL;
     pOut += BpL;
   }
-  __asm emms
+  hq_vec::reset();
 }
 
 void DLL InitLUTs()
diff --git a/src/gl/hqnx_asm/hqnx_asm.h b/src/gl/hqnx_asm/hqnx_asm.h
index 341e2dad3..2580f80ef 100644
--- a/src/gl/hqnx_asm/hqnx_asm.h
+++ b/src/gl/hqnx_asm/hqnx_asm.h
@@ -2,6 +2,7 @@
 //----------------------------------------------------------
 //Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
 //Copyright (C) 2009 Benjamin Berkels
+//Copyright (C) 2012-2014 Alexey Lysiuk
 //
 //This program is free software; you can redistribute it and/or
 //modify it under the terms of the GNU Lesser General Public
@@ -24,6 +25,198 @@
 
 #include "hqnx_asm_Image.h"
 
+// IMPORTANT NOTE!
+// The following is not a generic vectorized math class.
+// Each member function or overloaded operator performs one specific task to simplify client code.
+// To re-implement this class for a different platform, check each operation very carefully against
+// the Intel C++ Intrinsic Reference at http://software.intel.com/file/18072/
+
+#if defined _MSC_VER && defined _M_X64
+
+// Implementation via SSE2 intrinsics
+// MSVC doesn't support MMX intrinsics on x64
+
+#include <emmintrin.h>
+// Wrapper around one SSE2 register exposing only the operations the HQnX filters use.
+class hq_vec
+{
+public:
+  // Puts value in the low 32 bits (rest zero). Implicit on purpose: Diff() copy-initialises from ints.
+  hq_vec(const int value) : m_value(_mm_cvtsi32_si128(value))
+  {
+  }
+  // Zero-extends the four bytes of packed into the four lowest 16-bit lanes.
+  static hq_vec load(const int packed)
+  {
+    return _mm_unpacklo_epi8(_mm_cvtsi32_si128(packed), _mm_setzero_si128());
+  }
+  // Copies value into every 16-bit lane.
+  static hq_vec expand(const short value)
+  {
+    return _mm_set1_epi16(value);
+  }
+  // Narrows the lanes to bytes with unsigned saturation and writes the low four bytes as one int.
+  void store(unsigned char* const destination) const
+  {
+    *reinterpret_cast<int*>(destination) = _mm_cvtsi128_si32(_mm_packus_epi16(m_value, _mm_setzero_si128()));
+  }
+  // Nothing to clear: this implementation uses no MMX registers.
+  static void reset()
+  {
+  }
+  // Lane-wise 16-bit addition.
+  hq_vec& operator+=(const hq_vec& rhs)
+  {
+    m_value = _mm_add_epi16(m_value, rhs.m_value);
+    return *this;
+  }
+  // Lane-wise multiplication, keeping the low 16 bits of every product.
+  hq_vec& operator*=(const hq_vec& rhs)
+  {
+    m_value = _mm_mullo_epi16(m_value, rhs.m_value);
+    return *this;
+  }
+  // Unlike the built-in operator this MODIFIES *this: every 16-bit lane is shifted left by bits.
+  hq_vec& operator<<(const int bits)
+  {
+    m_value = _mm_sll_epi16(m_value, _mm_cvtsi32_si128(bits));
+    return *this;
+  }
+  // Unlike the built-in operator this MODIFIES *this: every 16-bit lane is shifted right, zero-filled.
+  hq_vec& operator>>(const int bits)
+  {
+    m_value = _mm_srl_epi16(m_value, _mm_cvtsi32_si128(bits));
+    return *this;
+  }
+
+private:
+  __m128i m_value;
+
+  // Wraps a raw intrinsic result; private so only the members and the friend operators use it.
+  hq_vec(const __m128i value) : m_value(value)
+  {
+  }
+  // The free operators read m_value directly.
+  friend hq_vec operator- (const hq_vec&, const hq_vec&);
+  friend hq_vec operator* (const hq_vec&, const hq_vec&);
+  friend hq_vec operator| (const hq_vec&, const hq_vec&);
+  friend bool   operator!=(const int,     const hq_vec&);
+};
+// Byte-wise subtraction with unsigned saturation: a result byte is 0 wherever lhs < rhs.
+inline hq_vec operator-(const hq_vec& lhs, const hq_vec& rhs)
+{
+  return _mm_subs_epu8(lhs.m_value, rhs.m_value);
+}
+// Lane-wise 16-bit multiplication, keeping the low 16 bits of every product.
+inline hq_vec operator*(const hq_vec& lhs, const hq_vec& rhs)
+{
+  return _mm_mullo_epi16(lhs.m_value, rhs.m_value);
+}
+// Bitwise OR over the whole register.
+inline hq_vec operator|(const hq_vec& lhs, const hq_vec& rhs)
+{
+  return _mm_or_si128(lhs.m_value, rhs.m_value);
+}
+// Compares scalar with the low 32 bits of vec; the remaining bits are ignored.
+inline bool operator!=(const int scalar, const hq_vec& vec)
+{
+  return scalar != _mm_cvtsi128_si32(vec.m_value);
+}
+
+#else // !(_MSC_VER && _M_X64)
+
+// Implementation via MMX intrinsics
+
+#include <mmintrin.h>
+// Wrapper around one MMX register exposing only the operations the HQnX filters use.
+class hq_vec
+{
+public:
+  // Puts value in the low 32 bits (rest zero). Implicit on purpose: Diff() copy-initialises from ints.
+  hq_vec(const int value) : m_value(_mm_cvtsi32_si64(value))
+  {
+  }
+  // Zero-extends the four bytes of packed into four 16-bit lanes.
+  static hq_vec load(const int packed)
+  {
+    return _mm_unpacklo_pi8(_mm_cvtsi32_si64(packed), _mm_setzero_si64());
+  }
+  // Copies value into every 16-bit lane.
+  // NOTE(review): also run by file-scope initializers, with no reset() afterwards -- confirm x87 state is safe.
+  static hq_vec expand(const short value)
+  {
+    return _mm_set1_pi16(value);
+  }
+  // Narrows the lanes to bytes with unsigned saturation and writes the low four bytes as one int.
+  void store(unsigned char* const destination) const
+  {
+    *reinterpret_cast<int*>(destination) = _mm_cvtsi64_si32(_mm_packs_pu16(m_value, _mm_setzero_si64()));
+  }
+  // Empties the MMX state (_mm_empty); the hqNx_32 entry points call this once before returning.
+  static void reset()
+  {
+    _mm_empty();
+  }
+  // Lane-wise 16-bit addition.
+  hq_vec& operator+=(const hq_vec& rhs)
+  {
+    m_value = _mm_add_pi16(m_value, rhs.m_value);
+    return *this;
+  }
+  // Lane-wise multiplication, keeping the low 16 bits of every product.
+  hq_vec& operator*=(const hq_vec& rhs)
+  {
+    m_value = _mm_mullo_pi16(m_value, rhs.m_value);
+    return *this;
+  }
+  // Unlike the built-in operator this MODIFIES *this: every 16-bit lane is shifted left by bits.
+  hq_vec& operator<<(const int bits)
+  {
+    m_value = _mm_sll_pi16(m_value, _mm_cvtsi32_si64(bits));
+    return *this;
+  }
+  // Unlike the built-in operator this MODIFIES *this: every 16-bit lane is shifted right, zero-filled.
+  hq_vec& operator>>(const int bits)
+  {
+    m_value = _mm_srl_pi16(m_value, _mm_cvtsi32_si64(bits));
+    return *this;
+  }
+
+private:
+  __m64 m_value;
+  // Wraps a raw intrinsic result; private so only the members and the friend operators use it.
+  hq_vec(const __m64 value) : m_value(value)
+  {
+  }
+
+  friend hq_vec operator- (const hq_vec&, const hq_vec&);
+  friend hq_vec operator* (const hq_vec&, const hq_vec&);
+  friend hq_vec operator| (const hq_vec&, const hq_vec&);
+  friend bool   operator!=(const int,     const hq_vec&);
+};
+// Byte-wise subtraction with unsigned saturation: a result byte is 0 wherever lhs < rhs.
+inline hq_vec operator-(const hq_vec& lhs, const hq_vec& rhs)
+{
+  return _mm_subs_pu8(lhs.m_value, rhs.m_value);
+}
+// Lane-wise 16-bit multiplication, keeping the low 16 bits of every product.
+inline hq_vec operator*(const hq_vec& lhs, const hq_vec& rhs)
+{
+  return _mm_mullo_pi16(lhs.m_value, rhs.m_value);
+}
+// Bitwise OR over the whole register.
+inline hq_vec operator|(const hq_vec& lhs, const hq_vec& rhs)
+{
+  return _mm_or_si64(lhs.m_value, rhs.m_value);
+}
+// Compares scalar with the low 32 bits of vec; the remaining bits are ignored.
+inline bool operator!=(const int scalar, const hq_vec& vec)
+{
+  return scalar != _mm_cvtsi64_si32(vec.m_value);
+}
+
+#endif // _MSC_VER && _M_X64
+
 namespace HQnX_asm
 {
 void DLL hq2x_32( int * pIn, unsigned char * pOut, int Xres, int Yres, int BpL );
diff --git a/src/gl/hqnx_asm/hqnx_asm_Image.cpp b/src/gl/hqnx_asm/hqnx_asm_Image.cpp
index be7d1b350..7af120173 100644
--- a/src/gl/hqnx_asm/hqnx_asm_Image.cpp
+++ b/src/gl/hqnx_asm/hqnx_asm_Image.cpp
@@ -20,6 +20,10 @@
 #include <string.h>
 #include "hqnx_asm_Image.h"
 
+#ifndef _MSC_VER
+#include <strings.h> // POSIX declares strcasecmp here, not in <string.h>
+#define _stricmp strcasecmp
+#endif
 namespace HQnX_asm
 {