// // Microsoft C inline assembler // //{{{ #ifdef __pragmas_h__ #ifndef __pragmas_x86_h__ #define __pragmas_x86_h__ static __inline int32_t sqr(int32_t a) { _asm { mov eax, a imul eax, eax } } static __inline int32_t scale(int32_t a, int32_t d, int32_t c) { _asm { mov eax, a imul d idiv c } } static __inline int32_t mulscale(int32_t a, int32_t d, int32_t c) { _asm { mov ecx, c mov eax, a imul d shrd eax, edx, cl } } #define _scaler(x) \ static __inline int32_t mulscale##x (int32_t a, int32_t d) \ { \ _asm mov eax, a \ _asm imul d \ _asm shrd eax, edx, x \ } \ static __inline int32_t dmulscale##x (int32_t a, int32_t d, int32_t S, int32_t D) \ { \ _asm mov eax, a \ _asm imul d \ _asm mov ebx, eax \ _asm mov eax, S \ _asm mov esi, edx \ _asm imul D \ _asm add eax, ebx \ _asm adc edx, esi \ _asm shrd eax, edx, x \ } \ PRAGMA_FUNCS #undef _scaler static __inline int32_t mulscale32(int32_t a, int32_t d) { _asm { mov eax, a imul d mov eax, edx } } static __inline int32_t dmulscale(int32_t a, int32_t d, int32_t S, int32_t D, int32_t c) { _asm { mov ecx, c mov eax, a imul d mov ebx, eax mov eax, S mov esi, edx imul D add eax, ebx adc edx, esi shrd eax, edx, cl } } static __inline int32_t dmulscale32(int32_t a, int32_t d, int32_t S, int32_t D) { _asm { mov eax, a imul d mov ebx, eax mov eax, S mov esi, edx imul D add eax, ebx adc edx, esi mov eax, edx } } #ifdef USE_ASM_DIVSCALE static __inline int32_t divscale(int32_t a, int32_t b, int32_t c) { _asm { mov eax, a mov ecx, c mov edx, eax shl eax, cl neg cl sar edx, cl idiv b } } static __inline int32_t divscale1(int32_t a, int32_t b) { _asm { mov eax, a add eax, eax sbb edx, edx idiv b } } static __inline int32_t divscale2(int32_t a, int32_t b) { _asm { mov eax, a mov edx, eax sar edx, 30 lea eax, [eax*4] idiv b } } static __inline int32_t divscale3(int32_t a, int32_t b) { _asm { mov eax, a mov edx, eax sar edx, 29 lea eax, [eax*8] idiv b } } #define DIVSCALE(x,y) \ static __inline int32_t divscale##y(int32_t a, int32_t b) \ { \ _asm mov eax, a \ _asm mov edx, eax \ _asm sar edx, x \ _asm shl eax, y \ _asm idiv b \ } DIVSCALE(28, 4) DIVSCALE(27, 5) DIVSCALE(26, 6) DIVSCALE(25, 7) DIVSCALE(24, 8) DIVSCALE(23, 9) DIVSCALE(22, 10) DIVSCALE(21, 11) DIVSCALE(20, 12) DIVSCALE(19, 13) DIVSCALE(18, 14) DIVSCALE(17, 15) DIVSCALE(16, 16) DIVSCALE(15, 17) DIVSCALE(14, 18) DIVSCALE(13, 19) DIVSCALE(12, 20) DIVSCALE(11, 21) DIVSCALE(10, 22) DIVSCALE(9, 23) DIVSCALE(8, 24) DIVSCALE(7, 25) DIVSCALE(6, 26) DIVSCALE(5, 27) DIVSCALE(4, 28) DIVSCALE(3, 29) DIVSCALE(2, 30) DIVSCALE(1, 31) static __inline int32_t divscale32(int32_t d, int32_t b) { _asm { mov edx, d xor eax, eax idiv b } } #endif // defined USE_ASM_DIVSCALE static __inline char readpixel(void *d) { _asm { mov edx, d mov al, byte ptr[edx] } } static __inline void drawpixel(void *d, char a) { _asm { mov edx, d mov al, a mov byte ptr[edx], al } } static __inline void drawpixels(void *d, int16_t a) { _asm { mov edx, d mov ax, a mov word ptr[edx], ax } } static __inline void drawpixelses(void *d, int32_t a) { _asm { mov edx, d mov eax, a mov dword ptr[edx], eax } } static __inline void clearbuf(void *d, int32_t c, int32_t a) { _asm { mov edi, d mov ecx, c mov eax, a rep stosd } } static __inline void clearbufbyte(void *d, int32_t c, int32_t a) { _asm { mov edi, d mov ecx, c mov eax, a cmp ecx, 4 jae longcopy test cl, 1 jz preskip stosb preskip : shr ecx, 1 rep stosw jmp endit longcopy : test edi, 1 jz skip1 stosb dec ecx skip1 : test edi, 2 jz skip2 stosw sub ecx, 2 skip2 : mov ebx, ecx shr ecx, 2 rep stosd test bl, 2 jz skip3 stosw skip3 : test bl, 1 jz endit stosb endit : } } static __inline void copybuf(const void *s, void *d, int32_t c) { _asm { mov esi, s mov edi, d mov ecx, c rep movsd } } static __inline void copybufbyte(const void *s, void *d, int32_t c) { _asm { mov esi, s mov edi, d mov ecx, c cmp ecx, 4 jae longcopy test cl, 1 jz preskip movsb preskip : shr ecx, 1 rep movsw jmp endit longcopy : test edi, 1 jz skip1 movsb dec ecx skip1 : test edi, 2 jz skip2 movsw sub ecx, 2 skip2 : mov ebx, ecx shr ecx, 2 rep movsd test bl, 2 jz skip3 movsw skip3 : test bl, 1 jz endit movsb endit : } } static __inline void copybufreverse(const void *s, void *d, int32_t c) { _asm { mov esi, s mov edi, d mov ecx, c shr ecx, 1 jnc skipit1 mov al, byte ptr[esi] dec esi mov byte ptr[edi], al inc edi skipit1 : shr ecx, 1 jnc skipit2 mov ax, word ptr[esi-1] sub esi, 2 ror ax, 8 mov word ptr[edi], ax add edi, 2 skipit2: test ecx, ecx jz endloop begloop : mov eax, dword ptr[esi-3] sub esi, 4 bswap eax mov dword ptr[edi], eax add edi, 4 dec ecx jnz begloop endloop : } } static __inline void qinterpolatedown16(int32_t a, int32_t c, int32_t d, int32_t s) { _asm { mov eax, a mov ecx, c mov edx, d mov esi, s mov ebx, ecx shr ecx, 1 jz skipbegcalc begqcalc : lea edi, [edx+esi] sar edx, 16 mov dword ptr[eax], edx lea edx, [edi+esi] sar edi, 16 mov dword ptr[eax+4], edi add eax, 8 dec ecx jnz begqcalc test ebx, 1 jz skipbegqcalc2 skipbegcalc : sar edx, 16 mov dword ptr[eax], edx skipbegqcalc2 : } } static __inline void qinterpolatedown16short(int32_t a, int32_t c, int32_t d, int32_t s) { _asm { mov eax, a mov ecx, c mov edx, d mov esi, s test ecx, ecx jz endit test al, 2 jz skipalignit mov ebx, edx sar ebx, 16 mov word ptr[eax], bx add edx, esi add eax, 2 dec ecx jz endit skipalignit : sub ecx, 2 jc finishit begqcalc : mov ebx, edx add edx, esi sar ebx, 16 mov edi, edx and edi, 0ffff0000h add edx, esi add ebx, edi mov dword ptr[eax], ebx add eax, 4 sub ecx, 2 jnc begqcalc test cl, 1 jz endit finishit : mov ebx, edx sar ebx, 16 mov word ptr[eax], bx endit : } } //returns eax/ebx, dmval = eax%edx; static __inline int32_t divmod(int32_t a, int32_t b) { _asm { mov eax, a xor edx, edx div b mov dmval, edx } } //returns eax%ebx, dmval = eax/edx; static __inline int32_t moddiv(int32_t a, int32_t b) { _asm { mov eax, a xor edx, edx div b mov dmval, eax mov eax, edx } } static __inline int32_t klabs(int32_t a) { _asm { mov eax, a test eax, eax jns skipnegate neg eax skipnegate : } } static __inline int32_t ksgn(int32_t b) { _asm { mov ebx, b add ebx, ebx sbb eax, eax cmp eax, ebx adc al, 0 } } //eax = (unsigned min)umin(eax,ebx) static __inline int32_t umin(int32_t a, int32_t b) { _asm { mov eax, a sub eax, b sbb ecx, ecx and eax, ecx add eax, b } } //eax = (unsigned max)umax(eax,ebx) static __inline int32_t umax(int32_t a, int32_t b) { _asm { mov eax, a sub eax, b sbb ecx, ecx xor ecx, 0xffffffff and eax, ecx add eax, b } } static __inline int32_t kmin(int32_t a, int32_t b) { _asm { mov eax, a mov ebx, b cmp eax, ebx jl skipit mov eax, ebx skipit : } } static __inline int32_t kmax(int32_t a, int32_t b) { _asm { mov eax, a mov ebx, b cmp eax, ebx jg skipit mov eax, ebx skipit : } } static __inline void swapchar(void *a, void *b) { _asm { mov eax, a mov ebx, b mov cl, [eax] mov ch, [ebx] mov[ebx], cl mov[eax], ch } } static __inline void swapshort(void *a, void *b) { _asm { mov eax, a mov ebx, b mov cx, [eax] mov dx, [ebx] mov[ebx], cx mov[eax], dx } } static __inline void swaplong(void *a, void *b) { _asm { mov eax, a mov ebx, b mov ecx, [eax] mov edx, [ebx] mov[ebx], ecx mov[eax], edx } } #define swapfloat swaplong static __inline void swapbuf4(void *a, void *b, int32_t c) { _asm { mov eax, a mov ebx, b mov ecx, c begswap : mov esi, [eax] mov edi, [ebx] mov[ebx], esi mov[eax], edi add eax, 4 add ebx, 4 dec ecx jnz short begswap } } static __inline void swap64bit(void *a, void *b) { _asm { mov eax, a mov ebx, b mov ecx, [eax] mov edx, [ebx] mov[ebx], ecx mov ecx, [eax+4] mov[eax], edx mov edx, [ebx+4] mov[ebx+4], ecx mov[eax+4], edx } } //swapchar2(ptr1,ptr2,xsiz); is the same as: //swapchar(ptr1,ptr2); swapchar(ptr1+1,ptr2+xsiz); static __inline void swapchar2(void *a, void *b, int32_t s) { _asm { mov eax, a mov ebx, b mov esi, s add esi, ebx mov cx, [eax] mov dl, [ebx] mov[ebx], cl mov dh, [esi] mov[esi], ch mov[eax], dx } } //}}} #endif // __pragmas_x86_h__ #endif // __pragmas_h__