// etqw-sdk/source/idlib/math/Simd_InstructionMacros.h
// Copyright (C) 2007 Id Software, Inc.
//
#ifndef __SIMD_SSE_MACROS_H__
#define __SIMD_SSE_MACROS_H__
//#include <xmmintrin.h>	// not used: MSVC's SSE intrinsics generate poor code, so the instructions are emitted by hand below
/*
===============================================================================
Instruction Macros.
The first argument of an instruction macro is the destination and
the second argument is the source operand. For most instructions
the destination operand must be one of _xmm0 to _xmm7. The source
operand can be any of the registers _xmm0 to _xmm7, or one of the
general purpose registers _eax, _ecx, _edx, _ebx, _esp, _ebp, _esi
or _edi holding the effective address.
For instance: haddps xmm0, xmm1
becomes: _haddps( _xmm0, _xmm1 )
and: haddps xmm0, [esi]
becomes: _haddps( _xmm0, _esi )
The ADDRESS_ADD_C macro can be used when the effective source address
is formed by adding a constant to a general purpose register.
The constant must be in the range [-128, 127]. Use the ADDRESS_ADD_LC
macro if the constant is not in this range.
For instance: haddps xmm0, [esi+48]
becomes: _haddps( _xmm0, ADDRESS_ADD_C( _esi, 48 ) )
The ADDRESS_ADD_R macro can be used when the effective source address
is formed by adding two general purpose registers.
For instance: haddps xmm0, [esi+eax]
becomes: _haddps( _xmm0, ADDRESS_ADD_R( _esi, _eax ) )
The ADDRESS_ADD_RC macro can be used when the effective source address
is formed by adding two general purpose registers and a constant.
The constant must be in the range [-128, 127]. Use the ADDRESS_ADD_RLC
macro if the constant is not in this range.
For instance: haddps xmm0, [esi+eax+48]
becomes: _haddps( _xmm0, ADDRESS_ADD_RC( _esi, _eax, 48 ) )
The ADDRESS_ADD_RS macro can be used when the effective source address is formed
by adding a scaled general purpose register to another general purpose register.
The scale must be either 1, 2, 4 or 8.
For instance: haddps xmm0, [esi+eax*4]
becomes: _haddps( _xmm0, ADDRESS_ADD_RS( _esi, _eax, 4 ) )
The ADDRESS_ADD_RSC macro can be used when the effective source address is formed
by adding a scaled general purpose register to another general purpose register and
also adding a constant. The scale must be either 1, 2, 4 or 8. The constant must
be in the range [-128, 127]. Use the ADDRESS_ADD_RSLC macro if the constant is not
in this range.
For instance: haddps xmm0, [esi+eax*4+64]
becomes: _haddps( _xmm0, ADDRESS_ADD_RSC( _esi, _eax, 4, 64 ) )
===============================================================================
*/
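/*
	The instruction macros expand to chains of single line _asm statements,
	so they can be mixed freely with ordinary __asm instructions inside a
	function. A minimal usage sketch, assuming 32 bit MSVC inline assembly
	and SSE3 hardware (the function name and data layout are illustrative):

	static void ExampleHorizontalSum( const float *src, float *dst ) {
		__asm mov		esi, src
		__asm mov		edi, dst
		__asm movaps	xmm0, [esi]			// xmm0 = x, y, z, w
		_haddps( _xmm0, _xmm0 )				// xmm0 = x+y, z+w, x+y, z+w
		_haddps( _xmm0, _xmm0 )				// xmm0 = x+y+z+w in all four lanes
		__asm movss		[edi], xmm0
	}
*/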
/*
===============================================================================
Register constants.
===============================================================================
*/
#define _eax 0x00
#define _ecx 0x01
#define _edx 0x02
#define _ebx 0x03
#define _esp 0x04
#define _ebp 0x05
#define _esi 0x06
#define _edi 0x07
#define _mm0 0xC0
#define _mm1 0xC1
#define _mm2 0xC2
#define _mm3 0xC3
#define _mm4 0xC4
#define _mm5 0xC5
#define _mm6 0xC6
#define _mm7 0xC7
#define _xmm0 0xC0
#define _xmm1 0xC1
#define _xmm2 0xC2
#define _xmm3 0xC3
#define _xmm4 0xC4
#define _xmm5 0xC5
#define _xmm6 0xC6
#define _xmm7 0xC7
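// The general purpose register constants are plain 3 bit register numbers,
// which end up in the ModRM rm field and select an indirect [reg] operand
// (ModRM mod = 00). The MMX/XMM constants have 0xC0 pre-merged (ModRM
// mod = 11, register direct), so ( ( dst & 7 ) << 3 ) | src forms a complete
// ModRM byte in either case.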
/*
===============================================================================
Addressing modes.
===============================================================================
*/
#define RSCALE( s ) ( ( ((s)&2)<<5 ) | ( ((s)&4)<<5 ) | ( ((s)&8)<<3 ) | ( ((s)&8)<<4 ) )
#define ADDRESS_ADD_C( reg0, constant ) 0x40 | ( reg0 & 7 ) \
_asm _emit constant
#define ADDRESS_ADD_LC( reg0, constant ) 0x80 | ( reg0 & 7 ) \
_asm _emit ( ( (constant) >> 0 ) & 255 ) \
_asm _emit ( ( (constant) >> 8 ) & 255 ) \
_asm _emit ( ( (constant) >> 16 ) & 255 ) \
_asm _emit ( ( (constant) >> 24 ) & 255 )
#define ADDRESS_ADD_R( reg0, reg1 ) 0x04 \
_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 )
#define ADDRESS_ADD_RC( reg0, reg1, constant ) 0x44 \
_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) \
_asm _emit constant
#define ADDRESS_ADD_RLC( reg0, reg1, constant ) 0x84 \
_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) \
_asm _emit ( ( (constant) >> 0 ) & 255 ) \
_asm _emit ( ( (constant) >> 8 ) & 255 ) \
_asm _emit ( ( (constant) >> 16 ) & 255 ) \
_asm _emit ( ( (constant) >> 24 ) & 255 )
#define ADDRESS_ADD_RS( reg0, reg1, scale ) 0x04 \
_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale )
#define ADDRESS_ADD_RSC( reg0, reg1, scale, constant ) 0x44 \
_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale ) \
_asm _emit constant
#define ADDRESS_ADD_RSLC( reg0, reg1, scale, constant ) 0x84 \
_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale ) \
_asm _emit ( ( (constant) >> 0 ) & 255 ) \
_asm _emit ( ( (constant) >> 8 ) & 255 ) \
_asm _emit ( ( (constant) >> 16 ) & 255 ) \
_asm _emit ( ( (constant) >> 24 ) & 255 )
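/*
	A worked expansion of the encoding above, shown for illustration:

		_haddps( _xmm0, ADDRESS_ADD_RSC( _esi, _eax, 4, 64 ) )

	emits the byte sequence F2 0F 7C 44 86 40: the ModRM byte 0x44
	(mod = 01, rm = 100 = SIB follows, disp8), the SIB byte 0x86
	(scale = 4, index = eax, base = esi) and the displacement 0x40,
	which is:

		haddps xmm0, [esi+eax*4+64]
*/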
/*
===============================================================================
Macros for third operand of shuffle/swizzle instructions.
===============================================================================
*/
#define SHUFFLE_PS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLE_PS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
#define SHUFFLE_PD( x, y ) (( (x) & 1 ) << 1 | ( (y) & 1 ))
#define R_SHUFFLE_PD( x, y ) (( (y) & 1 ) << 1 | ( (x) & 1 ))
#define SHUFFLE_D( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLE_D( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
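// With shufps the two low lanes of the result come from the destination and
// the two high lanes from the source, so with the reversed-argument form
// R_SHUFFLE_PS( x, y, z, w ) the result is:
//     dst[0] = dst[x], dst[1] = dst[y], dst[2] = src[z], dst[3] = src[w]
// For example, shufps xmm0, xmm1, R_SHUFFLE_PS( 0, 2, 0, 2 ) packs the even
// lanes of xmm0 and xmm1 into xmm0.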
/*
===============================================================================
SSE compare instructions.
===============================================================================
*/
#define _EQ 0
#define _LT 1
#define _LE 2
#define _UNORDERED 3
#define _NEQ 4
#define _NLT 5
#define _NLE 6
#define _ORDERED 7
#define _cmpps( dst, src, cond ) \
_asm _emit 0x0F \
_asm _emit 0xC2 \
_asm _emit ( ( dst & 7 ) << 3 ) | src \
_asm _emit cond
// Equal ( dst[0-3] = ( dst[0-3] == src[0-3] ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpeps( dst, src ) _cmpps( dst, src, _EQ )
#define _cmpeqps( dst, src ) _cmpps( dst, src, _EQ )
// Not Equal ( dst[0-3] = ( dst[0-3] != src[0-3] ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpneps( dst, src ) _cmpps( dst, src, _NEQ )
#define _cmpneqps( dst, src ) _cmpps( dst, src, _NEQ )
// Less Than ( dst[0-3] = ( dst[0-3] < src[0-3] ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpltps( dst, src ) _cmpps( dst, src, _LT )
// Less Equal ( dst[0-3] = ( dst[0-3] <= src[0-3] ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpleps( dst, src ) _cmpps( dst, src, _LE )
// Greater Equal ( dst[0-3] = ( dst[0-3] >= src[0-3] ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpgeps( dst, src ) _cmpps( dst, src, _NLT )
// Greater Than ( dst[0-3] = ( dst[0-3] > src[0-3] ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpgtps( dst, src ) _cmpps( dst, src, _NLE )
// Not Less Than ( dst[0-3] = ( dst[0-3] >= src[0-3] ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpnltps( dst, src ) _cmpps( dst, src, _NLT )
// Not Less Equal ( dst[0-3] = ( dst[0-3] > src[0-3] ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpnleps( dst, src ) _cmpps( dst, src, _NLE )
// Ordered ( dst[0-3] = ( neither dst[0-3] nor src[0-3] is NaN ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpordps( dst, src ) _cmpps( dst, src, _ORDERED )
// Unordered ( dst[0-3] = ( dst[0-3] or src[0-3] is NaN ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpunordps( dst, src ) _cmpps( dst, src, _UNORDERED )
#define _cmpss( dst, src, cond ) \
_asm _emit 0xF3 \
_asm _emit 0x0F \
_asm _emit 0xC2 \
_asm _emit ( ( dst & 7 ) << 3 ) | src \
_asm _emit cond
// Equal ( dst[0] = ( dst[0] == src[0] ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpess( dst, src ) _cmpss( dst, src, _EQ )
#define _cmpeqss( dst, src ) _cmpss( dst, src, _EQ )
// Not Equal ( dst[0] = ( dst[0] != src[0] ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpness( dst, src ) _cmpss( dst, src, _NEQ )
#define _cmpneqss( dst, src ) _cmpss( dst, src, _NEQ )
// Less Than ( dst[0] = ( dst[0] < src[0] ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpltss( dst, src ) _cmpss( dst, src, _LT )
// Less Equal ( dst[0] = ( dst[0] <= src[0] ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpless( dst, src ) _cmpss( dst, src, _LE )
// Greater Equal ( dst[0] = ( dst[0] >= src[0] ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpgess( dst, src ) _cmpss( dst, src, _NLT )
// Greater Than ( dst[0] = ( dst[0] > src[0] ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpgtss( dst, src ) _cmpss( dst, src, _NLE )
// Not Less Than ( dst[0] = ( dst[0] >= src[0] ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpnltss( dst, src ) _cmpss( dst, src, _NLT )
// Not Less Equal ( dst[0] = ( dst[0] > src[0] ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpnless( dst, src ) _cmpss( dst, src, _NLE )
// Ordered ( dst[0] = ( neither dst[0] nor src[0] is NaN ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpordss( dst, src ) _cmpss( dst, src, _ORDERED )
// Unordered ( dst[0] = ( dst[0] or src[0] is NaN ) ? 0xFFFFFFFF : 0x00000000 )
#define _cmpunordss( dst, src ) _cmpss( dst, src, _UNORDERED )
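/*
	The compare instructions produce all-ones/all-zeros lane masks, which
	combine with andps/andnps/orps into the classic branchless select idiom.
	A minimal sketch, assuming 32 bit MSVC inline assembly (the function
	name and buffers are illustrative):

	static void ExampleClampMin( float *v, const float *minimum ) {
		__asm mov		esi, v
		__asm mov		edi, minimum
		__asm movaps	xmm0, [esi]
		__asm movaps	xmm1, [edi]
		__asm movaps	xmm2, xmm0
		_cmpltps( _xmm2, _xmm1 )		// mask = ( v < minimum ) per lane
		__asm andps		xmm1, xmm2		// minimum where the mask is set
		__asm andnps	xmm2, xmm0		// v where the mask is clear
		__asm orps		xmm2, xmm1		// blend the two halves
		__asm movaps	[esi], xmm2
	}
*/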
/*
===============================================================================
SSE3 instructions.
===============================================================================
*/
// Packed Single-FP Add/Subtract ( dst[0]=dst[0]+src[0], dst[1]=dst[1]-src[1], dst[2]=dst[2]+src[2], dst[3]=dst[3]-src[3] )
#define _addsubps( dst, src ) \
_asm _emit 0xF2 \
_asm _emit 0x0F \
_asm _emit 0xD0 \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Packed Double-FP Add/Subtract ( dst[0]=dst[0]+src[0], dst[1]=dst[1]-src[1] )
#define _addsubpd( dst, src ) \
_asm _emit 0x66 \
_asm _emit 0x0F \
_asm _emit 0xD0 \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Packed Single-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=dst[2]+dst[3], dst[2]=src[0]+src[1], dst[3]=src[2]+src[3] )
#define _haddps( dst, src ) \
_asm _emit 0xF2 \
_asm _emit 0x0F \
_asm _emit 0x7C \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Packed Double-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=src[0]+src[1] )
#define _haddpd( dst, src ) \
_asm _emit 0x66 \
_asm _emit 0x0F \
_asm _emit 0x7C \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Packed Single-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=dst[2]-dst[3], dst[2]=src[0]-src[1], dst[3]=src[2]-src[3] )
#define _hsubps( dst, src ) \
_asm _emit 0xF2 \
_asm _emit 0x0F \
_asm _emit 0x7D \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Packed Double-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=src[0]-src[1] )
#define _hsubpd( dst, src ) \
_asm _emit 0x66 \
_asm _emit 0x0F \
_asm _emit 0x7D \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Move Packed Single-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0], dst[2]=src[2], dst[3]=src[2] )
#define _movsldup( dst, src ) \
_asm _emit 0xF3 \
_asm _emit 0x0F \
_asm _emit 0x12 \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Move One Double-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0] )
#define _movdldup( dst, src ) \
_asm _emit 0xF2 \
_asm _emit 0x0F \
_asm _emit 0x12 \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Move Packed Single-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1], dst[2]=src[3], dst[3]=src[3] )
#define _movshdup( dst, src ) \
_asm _emit 0xF3 \
_asm _emit 0x0F \
_asm _emit 0x16 \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Move One Double-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1] )
#define _movdhdup( dst, src ) \
_asm _emit 0xF2 \
_asm _emit 0x0F \
_asm _emit 0x16 \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Load Unaligned Integer 128 bits
#define _lddqu( dst, src ) \
_asm _emit 0xF2 \
_asm _emit 0x0F \
_asm _emit 0xF0 \
_asm _emit ( ( dst & 7 ) << 3 ) | src
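/*
	A typical use of _haddps is a four component dot product. A minimal
	sketch, assuming 32 bit MSVC inline assembly and SSE3 hardware (the
	function name is illustrative):

	static float ExampleDot4( const float *a, const float *b ) {
		float result;
		__asm mov		esi, a
		__asm mov		edi, b
		__asm movaps	xmm0, [esi]
		__asm mulps		xmm0, [edi]		// xmm0 = a0*b0, a1*b1, a2*b2, a3*b3
		_haddps( _xmm0, _xmm0 )			// xmm0 = a0*b0+a1*b1, a2*b2+a3*b3, repeated
		_haddps( _xmm0, _xmm0 )			// xmm0 = the full sum in every lane
		__asm movss		result, xmm0
		return result;
	}
*/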
/*
===============================================================================
SSE transpose macros
===============================================================================
*/
// transpose a 4x4 matrix loaded into 4 xmm registers (reg4 is temporary)
#define TRANSPOSE_4x4( reg0, reg1, reg2, reg3, reg4 ) \
__asm movaps reg4, reg2 /* reg4 = 8, 9, 10, 11 */ \
__asm unpcklps reg2, reg3 /* reg2 = 8, 12, 9, 13 */ \
__asm unpckhps reg4, reg3 /* reg4 = 10, 14, 11, 15 */ \
__asm movaps reg3, reg0 /* reg3 = 0, 1, 2, 3 */ \
__asm unpcklps reg0, reg1 /* reg0 = 0, 4, 1, 5 */ \
__asm unpckhps reg3, reg1 /* reg3 = 2, 6, 3, 7 */ \
__asm movaps reg1, reg0 /* reg1 = 0, 4, 1, 5 */ \
__asm shufps reg0, reg2, R_SHUFFLE_PS( 0, 1, 0, 1 ) /* reg0 = 0, 4, 8, 12 */ \
__asm shufps reg1, reg2, R_SHUFFLE_PS( 2, 3, 2, 3 ) /* reg1 = 1, 5, 9, 13 */ \
__asm movaps reg2, reg3 /* reg2 = 2, 6, 3, 7 */ \
__asm shufps reg2, reg4, R_SHUFFLE_PS( 0, 1, 0, 1 ) /* reg2 = 2, 6, 10, 14 */ \
__asm shufps reg3, reg4, R_SHUFFLE_PS( 2, 3, 2, 3 ) /* reg3 = 3, 7, 11, 15 */
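/*
	Usage sketch for the transpose macros: the arguments are plain register
	names, not the _xmm constants, because the macros expand to regular
	__asm instructions. An illustrative function, assuming a 16 byte
	aligned, row major 4x4 float matrix:

	static void ExampleTranspose4x4( float *m ) {
		__asm mov		esi, m
		__asm movaps	xmm0, [esi+ 0]
		__asm movaps	xmm1, [esi+16]
		__asm movaps	xmm2, [esi+32]
		__asm movaps	xmm3, [esi+48]
		TRANSPOSE_4x4( xmm0, xmm1, xmm2, xmm3, xmm4 )
		__asm movaps	[esi+ 0], xmm0
		__asm movaps	[esi+16], xmm1
		__asm movaps	[esi+32], xmm2
		__asm movaps	[esi+48], xmm3
	}
*/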
// transpose a 4x4 matrix from memory into 4 xmm registers (reg4 is temporary)
#define TRANSPOSE_4x4_FROM_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \
__asm movlps reg1, [address+ 0] /* reg1 = 0, 1, X, X */ \
__asm movlps reg3, [address+ 8] /* reg3 = 2, 3, X, X */ \
__asm movhps reg1, [address+16] /* reg1 = 0, 1, 4, 5 */ \
__asm movhps reg3, [address+24] /* reg3 = 2, 3, 6, 7 */ \
__asm movlps reg2, [address+32] /* reg2 = 8, 9, X, X */ \
__asm movlps reg4, [address+40] /* reg4 = 10, 11, X, X */ \
__asm movhps reg2, [address+48] /* reg2 = 8, 9, 12, 13 */ \
__asm movhps reg4, [address+56] /* reg4 = 10, 11, 14, 15 */ \
__asm movaps reg0, reg1 /* reg0 = 0, 1, 4, 5 */ \
__asm shufps reg0, reg2, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* reg0 = 0, 4, 8, 12 */ \
__asm shufps reg1, reg2, R_SHUFFLE_PS( 1, 3, 1, 3 ) /* reg1 = 1, 5, 9, 13 */ \
__asm movaps reg2, reg3 /* reg2 = 2, 3, 6, 7 */ \
__asm shufps reg2, reg4, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* reg2 = 2, 6, 10, 14 */ \
__asm shufps reg3, reg4, R_SHUFFLE_PS( 1, 3, 1, 3 ) /* reg3 = 3, 7, 11, 15 */
// transpose a 4x4 matrix to memory from 4 xmm registers (reg4 is temporary)
#define TRANSPOSE_4x4_TO_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \
__asm movaps reg4, reg0 /* reg4 = 0, 4, 8, 12 */ \
__asm unpcklps reg0, reg1 /* reg0 = 0, 1, 4, 5 */ \
__asm unpckhps reg4, reg1 /* reg4 = 8, 9, 12, 13 */ \
__asm movaps reg1, reg2 /* reg1 = 2, 6, 10, 14 */ \
__asm unpcklps reg2, reg3 /* reg2 = 2, 3, 6, 7 */ \
__asm unpckhps reg1, reg3 /* reg1 = 10, 11, 14, 15 */ \
__asm movlps [address+ 0], reg0 /* mem0 = 0, 1, X, X */ \
__asm movlps [address+ 8], reg2 /* mem0 = 0, 1, 2, 3 */ \
__asm movhps [address+16], reg0 /* mem1 = 4, 5, X, X */ \
__asm movhps [address+24], reg2 /* mem1 = 4, 5, 6, 7 */ \
__asm movlps [address+32], reg4 /* mem2 = 8, 9, X, X */ \
__asm movlps [address+40], reg1 /* mem2 = 8, 9, 10, 11 */ \
__asm movhps [address+48], reg4 /* mem3 = 12, 13, X, X */ \
__asm movhps [address+56], reg1 /* mem3 = 12, 13, 14, 15 */
// transpose a 4x3 matrix loaded into 3 xmm registers (reg3 is temporary)
// NOTE: the middle two columns are interchanged, which is much faster and fine for most calculations
#define TRANSPOSE_4x3( reg0, reg1, reg2, reg3 ) \
__asm movaps reg3, reg2 /* reg3 = 8, 9, 10, 11 */ \
__asm shufps reg3, reg1, R_SHUFFLE_PS( 2, 3, 0, 1 ) /* reg3 = 10, 11, 4, 5 */ \
__asm shufps reg2, reg0, R_SHUFFLE_PS( 0, 1, 2, 3 ) /* reg2 = 8, 9, 2, 3 */ \
__asm shufps reg1, reg0, R_SHUFFLE_PS( 2, 3, 0, 1 ) /* reg1 = 6, 7, 0, 1 */ \
__asm movaps reg0, reg1 /* reg0 = 6, 7, 0, 1 */ \
__asm shufps reg0, reg2, R_SHUFFLE_PS( 2, 0, 3, 1 ) /* reg0 = 0, 6, 3, 9 */ \
__asm shufps reg1, reg3, R_SHUFFLE_PS( 3, 1, 2, 0 ) /* reg1 = 1, 7, 4, 10 */ \
__asm shufps reg2, reg3, R_SHUFFLE_PS( 2, 0, 3, 1 ) /* reg2 = 2, 8, 5, 11 */
// transpose a 4x3 matrix from memory into 3 xmm registers (reg3 is temporary)
// NOTE: the middle two columns are interchanged, which is much faster and fine for most calculations
#define TRANSPOSE_4x3_FROM_MEMORY( address, reg0, reg1, reg2, reg3 ) \
__asm movlps reg1, [address+ 0] /* reg1 = 0, 1, X, X */ \
__asm movlps reg2, [address+ 8] /* reg2 = 2, 3, X, X */ \
__asm movlps reg3, [address+16] /* reg3 = 4, 5, X, X */ \
__asm movhps reg1, [address+24] /* reg1 = 0, 1, 6, 7 */ \
__asm movhps reg2, [address+32] /* reg2 = 2, 3, 8, 9 */ \
__asm movhps reg3, [address+40] /* reg3 = 4, 5, 10, 11 */ \
__asm movaps reg0, reg1 /* reg0 = 0, 1, 6, 7 */ \
__asm shufps reg0, reg2, R_SHUFFLE_PS( 0, 2, 1, 3 ) /* reg0 = 0, 6, 3, 9 */ \
__asm shufps reg1, reg3, R_SHUFFLE_PS( 1, 3, 0, 2 ) /* reg1 = 1, 7, 4, 10 */ \
__asm shufps reg2, reg3, R_SHUFFLE_PS( 0, 2, 1, 3 ) /* reg2 = 2, 8, 5, 11 */
// transpose a 4x3 matrix to memory from 3 xmm registers (reg3 is temporary)
// NOTE: the middle two columns are interchanged, which is much faster and fine for most calculations
#define TRANSPOSE_4x3_TO_MEMORY( address, reg0, reg1, reg2, reg3 ) \
__asm movhlps reg3, reg0 /* reg3 = 3, 9, X, X */ \
__asm unpcklps reg0, reg1 /* reg0 = 0, 1, 6, 7 */ \
__asm unpckhps reg1, reg2 /* reg1 = 4, 5, 10, 11 */ \
__asm unpcklps reg2, reg3 /* reg2 = 2, 3, 8, 9 */ \
__asm movlps [address+ 0], reg0 /* mem0 = 0, 1, X, X */ \
__asm movlps [address+ 8], reg2 /* mem0 = 0, 1, 2, 3 */ \
__asm movlps [address+16], reg1 /* mem1 = 4, 5, X, X */ \
__asm movhps [address+24], reg0 /* mem1 = 4, 5, 6, 7 */ \
__asm movhps [address+32], reg2 /* mem2 = 8, 9, X, X */ \
__asm movhps [address+40], reg1 /* mem2 = 8, 9, 10, 11 */
/*
===============================================================================
SSE2 transpose macros
===============================================================================
*/
// transpose a 4x4 integer matrix loaded into 4 xmm registers (reg4 is temporary)
// this is faster than the SSE float version because punpcklqdq has a lower latency than shufps
#define TRANSPOSE_4x4_INT( reg0, reg1, reg2, reg3, reg4 ) \
__asm movdqa reg4, reg2 /* reg4 = 8, 9, 10, 11 */ \
__asm punpckldq reg2, reg3 /* reg2 = 8, 12, 9, 13 */ \
__asm punpckhdq reg4, reg3 /* reg4 = 10, 14, 11, 15 */ \
__asm movdqa reg3, reg0 /* reg3 = 0, 1, 2, 3 */ \
__asm punpckldq reg0, reg1 /* reg0 = 0, 4, 1, 5 */ \
__asm punpckhdq reg3, reg1 /* reg3 = 2, 6, 3, 7 */ \
__asm movdqa reg1, reg0 /* reg1 = 0, 4, 1, 5 */ \
__asm punpcklqdq reg0, reg2 /* reg0 = 0, 4, 8, 12 */ \
__asm punpckhqdq reg1, reg2 /* reg1 = 1, 5, 9, 13 */ \
__asm movdqa reg2, reg3 /* reg2 = 2, 6, 3, 7 */ \
__asm punpcklqdq reg2, reg4 /* reg2 = 2, 6, 10, 14 */ \
__asm punpckhqdq reg3, reg4 /* reg3 = 3, 7, 11, 15 */
/*
===============================================================================
Macros to create SSE constants.
===============================================================================
*/
#define ALIGN4_INIT1( X, I0 ) ALIGN16( static X[4] ) = { I0, I0, I0, I0 }
#define ALIGN4_INIT4( X, I0, I1, I2, I3 ) ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
#define ALIGN8_INIT1( X, I0 ) ALIGN16( static X[8] ) = { I0, I0, I0, I0, I0, I0, I0, I0 }
#define ALIGN16_INIT1( X, I0 ) ALIGN16( static X[16] ) = { I0, I0, I0, I0, I0, I0, I0, I0, I0, I0, I0, I0, I0, I0, I0, I0 }
#define IEEE_SP_ZERO 0
#define IEEE_SP_ONE 0x3f800000
#define IEEE_SP_SIGN ((unsigned long) ( 1UL << 31 ))
#define IEEE_SP_INF ((unsigned long) ( 0xFF << 23 ))
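// Typical use of the macros above (the constant names are illustrative, and
// ALIGN16 is assumed to be the engine's 16 byte alignment wrapper):
//     ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
//     ALIGN4_INIT1( unsigned long SIMD_SP_signBit, IEEE_SP_SIGN );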
/*
===============================================================================
Macro to remove the bounds check from switch statements.
===============================================================================
*/
#ifdef _DEBUG
#define NODEFAULT default: assert( 0 )
#elif defined( _WIN32 )
#define NODEFAULT default: __assume( 0 )
#else
#define NODEFAULT
#endif
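/*
	Typical use: when the default case is provably unreachable, __assume( 0 )
	lets the release build drop the switch range check while the debug build
	still asserts. A minimal sketch (illustrative):

		switch( count & 3 ) {
			case 0: break;
			case 1: break;
			case 2: break;
			case 3: break;
			NODEFAULT;
		}
*/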
#endif /* !__SIMD_SSE_MACROS_H__ */