/*
	Mirror of https://github.com/DrBeef/QuestZDoom.git
	(synced 2025-03-06 09:21:22 +00:00; upstream file: 389 lines, 7.4 KiB;
	labelled "ArmAsm" by the mirror)
*/
/*
	dct36_sse: SSE optimized dct36

	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	Register aliases used by dct36_sse.

	Several aliases intentionally name the same physical register; they are
	live in different phases of the routine:
	  %edi : "in" while the input is loaded, "out1" once the store phase starts
	  %edx : "tfcos36_" (table pointer) during the transform, "out2" afterwards
	  %eax : "COS9_" (table pointer) during the transform, "ts" afterwards
	"w" (%ecx) is only loaded for the store phase; "tmp" (%esi) points at the
	stack scratch area for the whole routine.
*/
#define in %edi
#define out1 %edi
#define out2 %edx
#define w %ecx
#define ts %eax
#define COS9_ %eax
#define tfcos36_ %edx
#define tmp %esi

/*
	void dct36_sse(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf);
*/

/*
	Constant tables for dct36_sse.

	All entries are IEEE-754 single-precision bit patterns; the decimal values
	in the comments are rounded decodings of those bit patterns.
	Every table is preceded by ALIGN16 (macro defined outside this file,
	presumably in mangle.h) because the code reads the tables with movaps /
	aligned memory operands.
	NOTE(review): the COS9 values look like cosines of multiples of 10 degrees
	and the tfcos36 values like 0.5/cos(...) terms - confirm against the C dct36.
*/
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN16
/* Four 16-byte vectors (offsets 0/16/32/48); each coefficient is stored twice
   so that both lanes of a pair are scaled by the same value. */
dct36_sse_COS9:
	.long 0x3f5db3d7	/* ~0.866025 */
	.long 0x3f5db3d7
	.long 0x3f000000	/*  0.5      */
	.long 0x3f000000
	.long 0x3f7c1c5c	/* ~0.984808 */
	.long 0x3f7c1c5c
	.long 0x3f708fb2	/* ~0.939693 */
	.long 0x3f708fb2
	.long 0x3f248dbb	/* ~0.642788 */
	.long 0x3f248dbb
	.long 0x3e31d0d4	/* ~0.173648 */
	.long 0x3e31d0d4
	.long 0x3eaf1d44	/* ~0.342020 */
	.long 0x3eaf1d44
	.long 0x3f441b7d	/* ~0.766044 */
	.long 0x3f441b7d
	ALIGN16
/* Nine values: two vectors read at offsets 0 and 16, one scalar read at 32. */
dct36_sse_tfcos36:
	.long 0x3f007d2b	/* ~0.501910 */
	.long 0x3f0483ee	/* ~0.517638 */
	.long 0x3f0d3b7d	/* ~0.551689 */
	.long 0x3f1c4257	/* ~0.610387 */
	.long 0x40b79454	/* ~5.736857 */
	.long 0x3ff746ea	/* ~1.931852 */
	.long 0x3f976fd9	/* ~1.183101 */
	.long 0x3f5f2944	/* ~0.871723 */
	.long 0x3f3504f3	/* ~0.707107 */
	ALIGN16
/* AND mask keeping lanes 1 and 3 and clearing lanes 0 and 2. */
dct36_sse_mask:
	.long 0,0xffffffff,0,0xffffffff
	ALIGN16
/* XOR mask with only the sign bit set in every lane (negates four floats). */
dct36_sse_sign:
	.long 0x80000000,0x80000000,0x80000000,0x80000000

/*
	void dct36_sse(real *inbuf, real *o1, real *o2, real *wintab, real *tsbuf);

	32-bit x86 / SSE routine. All five pointer arguments are read from the
	stack at 8(%ebp) .. 24(%ebp); values are handled as 4-byte floats.

	Memory touched (as visible from the loads and stores in this routine):
	  inbuf  : 72 bytes read  (18 values, unaligned loads)
	  wintab : 144 bytes read (36 values)
	  o1     : 72 bytes read  (18 values); each is added to a windowed result
	  o2     : 72 bytes written (18 values)
	  tsbuf  : 18 values written, one every 128 bytes (byte offsets 0 .. 32*68)

	Registers: %ebx, %esi, %edi and %ebp are saved and restored.
	%eax, %ecx, %edx, %xmm0-%xmm7 and the flags are modified.

	Stack: the frame is realigned to 16 bytes and 80 bytes of scratch are
	reserved; "tmp" points at that scratch (five 16-byte slots, 0..64).

	ASM_NAME, ALIGN16, MOVUAPS and NONEXEC_STACK are macros defined outside
	this file (presumably in mangle.h).
	NOTE(review): MOVUAPS is used for wintab offsets 0 and 128 - confirm
	whether it expands to an aligned or unaligned load, i.e. whether wintab
	must be 16-byte aligned.
*/
	.text
	ALIGN16
	.globl ASM_NAME(dct36_sse)
ASM_NAME(dct36_sse):
	push		%ebp
	mov			%esp, %ebp
	and			$-16, %esp			/* realign so the scratch area can be used with movaps */
	sub			$80, %esp			/* 80 bytes of scratch */
	push		%ebx
	push		%esi
	push		%edi
	/* Position-independent table addressing: call/pop leaves the address of
	   label 1 in %ebx; tables are then addressed as symbol-1b(%ebx). */
	call		1f
1:
	pop			%ebx
	lea			dct36_sse_COS9-1b(%ebx), COS9_
	lea			dct36_sse_tfcos36-1b(%ebx), tfcos36_
	lea			12(%esp), tmp		/* skip the three saved registers: tmp = aligned scratch base */
	movl		8(%ebp), in

	/* Pass 1: load in[0..17] and add the lower neighbour to every element
	   (in[i] += in[i-1], in[0] unchanged). xmm5 holds in[16,17] in its low
	   half; xmm0 is zero and supplies the "in[-1]" lane. */
	xorps		%xmm0, %xmm0
	xorps		%xmm5, %xmm5
	movlps		64(in), %xmm5
	movups		48(in), %xmm4
	movups		32(in), %xmm3
	movups		16(in), %xmm2
	movups		(in), %xmm1
	movaps		%xmm5, %xmm6
	shufps		$0xe1, %xmm6, %xmm6
	movaps		%xmm4, %xmm7
	shufps		$0x93, %xmm7, %xmm7
	movss		%xmm7, %xmm6
	addps		%xmm6, %xmm5
	movaps		%xmm3, %xmm6
	shufps		$0x93, %xmm6, %xmm6
	movss		%xmm6, %xmm7
	addps		%xmm7, %xmm4
	movaps		%xmm2, %xmm7
	shufps		$0x93, %xmm7, %xmm7
	movss		%xmm7, %xmm6
	addps		%xmm6, %xmm3
	movaps		%xmm1, %xmm6
	shufps		$0x93, %xmm6, %xmm6
	movss		%xmm6, %xmm7
	addps		%xmm7, %xmm2
	movss		%xmm0, %xmm6
	addps		%xmm6, %xmm1

	/* Pass 2: repack the registers shifted by two elements and add in[i-2]
	   (value from pass 1) to the odd-indexed elements; the mask keeps only
	   lanes 1 and 3 of the addend. */
	movaps		dct36_sse_mask-1b(%ebx), %xmm0
	movaps		%xmm4, %xmm6
	shufps		$0x4e, %xmm5, %xmm4
	movaps		%xmm3, %xmm7
	shufps		$0x4e, %xmm6, %xmm3
	andps		%xmm0, %xmm6
	addps		%xmm6, %xmm4
	movaps		%xmm2, %xmm6
	shufps		$0x4e, %xmm7, %xmm2
	andps		%xmm0, %xmm7
	addps		%xmm7, %xmm3
	movaps		%xmm1, %xmm7
	shufps		$0x4e, %xmm6, %xmm1
	andps		%xmm0, %xmm6
	addps		%xmm6, %xmm2
	movaps		%xmm7, %xmm6
	andps		%xmm0, %xmm7
	xorps		%xmm0, %xmm0
	addps		%xmm7, %xmm1
	movlhps		%xmm6, %xmm0

/*
	xmm0 in[-,-,0,1]
	xmm1 in[2,3,4,5]
	xmm2 in[6,7,8,9]
	xmm3 in[10,11,12,13]
	xmm4 in[14,15,16,17]
*/

	/* Rotate the high halves of xmm2/xmm3/xmm4 among each other. */
	movaps		%xmm2, %xmm5
	shufps		$0xe4, %xmm3, %xmm5
	shufps		$0xe4, %xmm4, %xmm3
	shufps		$0xe4, %xmm2, %xmm4
	movaps		%xmm5, %xmm2

/*
	xmm2 in[6,7,12,13]
	xmm3 in[10,11,16,17]
	xmm4 in[14,15,8,9]
*/

	mulps		(COS9_), %xmm5
	addps		%xmm0, %xmm5

	/* Spill values that are needed again after xmm0/xmm2 are reused. */
	movaps		%xmm0, (tmp)
	movaps		%xmm2, 16(tmp)

/*
	0(tmp) in[-,-,0,1]
	xmm5 [ta33,tb33,ta66,tb66]
*/

	/* Second intermediate vector -> 48(tmp) */
	movaps		%xmm1, %xmm6
	subps		%xmm3, %xmm6
	subps		%xmm4, %xmm6
	xorps		%xmm7, %xmm7
	shufps		$0xe0, %xmm2, %xmm7
	mulps		(COS9_), %xmm6
	subps		%xmm7, %xmm0
	addps		%xmm0, %xmm6
	movaps		%xmm6, 48(tmp)

	movaps		16(COS9_), %xmm2	/* keep the second coefficient vector in a register */

	/* First intermediate vector -> 32(tmp) */
	movaps		%xmm1, %xmm0
	movaps		%xmm3, %xmm6
	movaps		%xmm4, %xmm7
	mulps		%xmm2, %xmm0
	mulps		32(COS9_), %xmm6
	mulps		48(COS9_), %xmm7
	addps		%xmm5, %xmm0
	addps		%xmm7, %xmm6
	addps		%xmm6, %xmm0
	movaps		%xmm0, 32(tmp)

	/* Third intermediate vector -> 64(tmp) */
	movaps		%xmm1, %xmm0
	movaps		%xmm3, %xmm6
	movaps		%xmm4, %xmm7
	mulps		32(COS9_), %xmm0
	mulps		48(COS9_), %xmm6
	mulps		%xmm2, %xmm7
	subps		%xmm5, %xmm0
	subps		%xmm6, %xmm7
	addps		%xmm7, %xmm0
	movaps		%xmm0, 64(tmp)

	/* Fourth intermediate vector stays in xmm6 */
	movaps		%xmm1, %xmm6
	movaps		%xmm4, %xmm7
	mulps		48(COS9_), %xmm6
	mulps		%xmm3, %xmm2
	mulps		32(COS9_), %xmm7
	subps		%xmm5, %xmm6
	subps		%xmm7, %xmm2
	addps		%xmm2, %xmm6

	/* Remaining two results: combined from the spilled inputs; only the low
	   lane is scaled by the scalar at tfcos36 offset 32. Result replaces
	   the spill in 0(tmp). */
	movaps		(tmp), %xmm0
	movss		32(tfcos36_), %xmm5
	subps		%xmm1, %xmm0
	subps		16(tmp), %xmm4
	addps		%xmm3, %xmm0
	addps		%xmm4, %xmm0
	shufps		$0xaf, %xmm0, %xmm0
	mulss		%xmm5, %xmm0
	movaps		%xmm0, (tmp)

	movaps		32(tmp), %xmm0
	movaps		48(tmp), %xmm1
	movaps		64(tmp), %xmm2

/*
	xmm0 [1a-0,1b-0, 2a-0, 2b-0]
	xmm1 [1a-1,1b-1, 2a-1, 2b-1]
	xmm2 [1a-2,1b-2,-2a-2,-2b-2]
	xmm6 [1a-3,1b-3,-2a-3,-2b-3]
*/

	/* 4x4 transpose, step 1: interleave rows; the sign mask undoes the
	   negation of the 2a/2b entries noted above. */
	movaps		%xmm0, %xmm3
	unpcklps	%xmm1, %xmm0
	unpckhps	%xmm1, %xmm3
	movaps		%xmm2, %xmm5
	unpcklps	%xmm6, %xmm2
	unpckhps	%xmm6, %xmm5
	xorps		dct36_sse_sign-1b(%ebx), %xmm5

/*
	xmm0 [1a-0,1a-1,1b-0,1b-1]
	xmm3 [2a-0,2a-1,2b-0,2b-1]
	xmm2 [1a-2,1a-3,1b-2,1b-3]
	xmm5 [2a-2,2a-3,2b-2,2b-3]
*/

	/* 4x4 transpose, step 2: exchange 64-bit halves. */
	movaps		%xmm0, %xmm1
	movlhps		%xmm2, %xmm0
	movhlps		%xmm1, %xmm2
	movaps		%xmm3, %xmm4
	movlhps		%xmm5, %xmm3
	movhlps		%xmm4, %xmm5

/*
	xmm0 tmp1a
	xmm3 tmp2a
	xmm2 tmp1b
	xmm5 tmp2b
*/

	/* Sum/difference of the a and b parts; the b-part results are scaled by
	   the two tfcos36 vectors. */
	movaps		(tfcos36_), %xmm6
	movaps		16(tfcos36_), %xmm7
	movaps		%xmm5, %xmm1
	addps		%xmm2, %xmm5
	subps		%xmm2, %xmm1
	movaps		%xmm3, %xmm2
	addps		%xmm0, %xmm3
	subps		%xmm0, %xmm2
	mulps		%xmm6, %xmm5
	mulps		%xmm1, %xmm7

	movaps		%xmm2, 16(tmp)

/*
	%xmm3 tmp[0,1,2,3]
	%xmm5 tmp[17,16,15,14]
	16(tmp) tmp[8,7,6,5]
	%xmm7 tmp[9,10,11,12]
	0(tmp) tmp[13,-,4,-]
*/

	/* Store phase. The table pointers are no longer needed, so %edx/%eax
	   are reloaded as out2/ts and %edi as out1 (see the register aliases). */
	movl		12(%ebp), out1
	movl		16(%ebp), out2
	movl		20(%ebp), w
	movl		24(%ebp), ts

	/* Group 1: tmp[0..3] with tmp[17..14].
	   sum  = a + b -> multiplied by window floats 27..30 and 23..26 -> o2[9..12], o2[5..8]
	   diff = a - b -> multiplied by window floats 9..12 and 5..8, added to
	                   o1[9..12] / o1[5..8] -> ts (one float every 128 bytes)
	   shufps $0x1b reverses lane order where the index runs backwards. */
	movaps		%xmm3, %xmm0
	movaps		%xmm5, %xmm1
	movups		108(w), %xmm2
	movups		92(w), %xmm3
	shufps		$0x1b, %xmm3, %xmm3
	movups		36(w), %xmm4
	movups		20(w), %xmm5
	shufps		$0x1b, %xmm5, %xmm5
	movaps		%xmm0, %xmm6
	addps		%xmm1, %xmm0
	subps		%xmm1, %xmm6
	mulps		%xmm0, %xmm2
	mulps		%xmm3, %xmm0
	mulps		%xmm6, %xmm4
	mulps		%xmm5, %xmm6
	movups		36(out1), %xmm1
	movups		20(out1), %xmm3
	shufps		$0x1b, %xmm6, %xmm6
	addps		%xmm4, %xmm1
	addps		%xmm6, %xmm3
	shufps		$0x1b, %xmm0, %xmm0
	movups		%xmm2, 36(out2)
	movups		%xmm0, 20(out2)
	/* Scatter the four lanes of xmm1/xmm3 to ts: lane 0, lane 2 (via
	   movhlps), then after swapping within pairs lane 1 and lane 3. */
	movss		%xmm1, 32*36(ts)
	movss		%xmm3, 32*20(ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*44(ts)
	movss		%xmm4, 32*28(ts)
	shufps		$0xb1, %xmm1, %xmm1
	shufps		$0xb1, %xmm3, %xmm3
	movss		%xmm1, 32*40(ts)
	movss		%xmm3, 32*24(ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*48(ts)
	movss		%xmm4, 32*32(ts)

	/* Group 2 (scalar): tmp[4] (at 8(tmp)) with tmp[13] (at 0(tmp)).
	   Same sum/diff scheme for window floats 31, 22, 13, 4 ->
	   o2[13], o2[4] and ts slots 13 and 4. */
	movss		8(tmp), %xmm0
	movss		(tmp), %xmm1
	movss		124(w), %xmm2
	movss		88(w), %xmm3
	movss		52(w), %xmm4
	movss		16(w), %xmm5
	movss		%xmm0, %xmm6
	addss		%xmm1, %xmm0
	subss		%xmm1, %xmm6
	mulss		%xmm0, %xmm2
	mulss		%xmm3, %xmm0
	mulss		%xmm6, %xmm4
	mulss		%xmm5, %xmm6
	addss		52(out1), %xmm4
	addss		16(out1), %xmm6
	movss		%xmm2, 52(out2)
	movss		%xmm0, 16(out2)
	movss		%xmm4, 32*52(ts)
	movss		%xmm6, 32*16(ts)

	/* Group 3: tmp[8,7,6,5] (spilled at 16(tmp)) with tmp[9..12] (xmm7).
	   Window floats 32..35, 18..21, 14..17, 0..3 ->
	   o2[14..17], o2[0..3] and ts slots 14..17, 0..3. */
	movaps		16(tmp), %xmm0
	movaps		%xmm7, %xmm1
	MOVUAPS		128(w), %xmm2
	movups		72(w), %xmm3
	shufps		$0x1b, %xmm2, %xmm2
	movlps		56(w), %xmm4
	movhps		64(w), %xmm4
	MOVUAPS		(w), %xmm5
	shufps		$0x1b, %xmm4, %xmm4
	movaps		%xmm0, %xmm6
	addps		%xmm1, %xmm0
	subps		%xmm1, %xmm6
	mulps		%xmm0, %xmm2
	mulps		%xmm3, %xmm0
	mulps		%xmm6, %xmm4
	mulps		%xmm5, %xmm6
	movlps		56(out1), %xmm1
	movhps		64(out1), %xmm1
	movups		(out1), %xmm3
	shufps		$0x1b, %xmm4, %xmm4
	addps		%xmm6, %xmm3
	addps		%xmm4, %xmm1
	shufps		$0x1b, %xmm2, %xmm2
	movups		%xmm0, (out2)
	movlps		%xmm2, 56(out2)
	movhps		%xmm2, 64(out2)
	/* Same lane scatter as in group 1. */
	movss		%xmm1, 32*56(ts)
	movss		%xmm3, (ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*64(ts)
	movss		%xmm4, 32*8(ts)
	shufps		$0xb1, %xmm1, %xmm1
	shufps		$0xb1, %xmm3, %xmm3
	movss		%xmm1, 32*60(ts)
	movss		%xmm3, 32*4(ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*68(ts)
	movss		%xmm4, 32*12(ts)

	/* Epilogue: %esp is still where the three pushes left it, so the pops
	   pair up; restoring %esp from %ebp drops the aligned scratch area. */
	pop			%edi
	pop			%esi
	pop			%ebx
	mov			%ebp, %esp
	pop			%ebp

	ret

NONEXEC_STACK