mirror of
https://github.com/Q3Rally-Team/rallyunlimited-engine.git
synced 2024-11-25 05:31:16 +00:00
119 lines
No EOL
2 KiB
NASM
119 lines
No EOL
2 KiB
NASM
|
|
; this is mostly a copy of the x86 version, with x86_64-specific fixes and comments
|
|
; runs about 4x faster than the C version
|
|
; saves about 3M ticks each second
|
|
|
|
.code

;-----------------------------------------------------------------------
; S_WriteLinearBlastStereo16_SSE_x64
;
; Converts a run of signed 32-bit samples to signed 16-bit samples:
; each input dword is arithmetically shifted right by 8 and saturated
; to [-32768, 32767] before being stored as a word.
;
; Syntax: MASM (ml64)
; ABI:    Microsoft x64
; In:     rcx = snd_p            (source, 32-bit samples)
;         rdx = snd_out          (destination, 16-bit samples)
;         r8d = snd_linear_count (number of samples; <= 0 does nothing)
; Out:    nothing
; Clobb:  rax, rcx, rdx, r8, r9, mm0-mm7, flags
;
; Only volatile registers are used and rsp is never modified, so this
; is a true leaf function and needs no prolog/unwind data.
;
; Register roles inside the body:
;         r9  = current source pointer
;         rdx = current destination pointer
;         r8d = samples remaining
;         ecx = loop counter (alignment dwords / 64-byte blocks)
;         eax = sample being clamped in the scalar loops
;-----------------------------------------------------------------------
S_WriteLinearBlastStereo16_SSE_x64 PROC

        mov     r9, rcx                 ; r9 = snd_p (rcx is reused as a counter)

        test    r8d, r8d
        jle     LExit                   ; zero or negative count: nothing to convert

        mov     ecx, r9d                ; try to align source to a 64-byte boundary
        and     ecx, 63                 ; ecx = byte offset inside the 64-byte line
        jz      LMain                   ; already aligned - go to main loop
        test    ecx, 3                  ; test only: the offset in ecx must survive
        jnz     LTail                   ; not dword-aligned - scalar work & exit,
                                        ; should never happen though
        shr     ecx, 2                  ; dwords past the previous boundary (1..15)
        neg     ecx
        add     ecx, 16                 ; ecx = dwords up to the next boundary (15..1)

; Scalar loop that consumes samples until the source is 64-byte aligned.
LClamp1:
        mov     eax, dword ptr [r9]
        sar     eax, 8
        cmp     eax, 32767
        jg      LClampHigh1
        cmp     eax, -32768
        jge     LClampDone1
        mov     eax, -32768
        jmp     LClampDone1
LClampHigh1:
        mov     eax, 32767
LClampDone1:
        mov     word ptr [rdx], ax
        add     r9, 4
        add     rdx, 2
        dec     r8d
        jz      LExit                   ; ran out of samples while aligning;
                                        ; no MMX/non-temporal work was done yet
        dec     ecx
        jnz     LClamp1

; Main loop: 16 samples (64 source bytes -> 32 destination bytes) per pass.
LMain:
        mov     ecx, r8d
        shr     ecx, 4                  ; ecx = number of whole 16-sample blocks
        jz      LTail                   ; not enough for a 64-byte block
        and     r8d, 15                 ; remainder for the tail loop

LAgain:
        movq    mm0, qword ptr [r9+ 0]
        movq    mm1, qword ptr [r9+ 8]
        movq    mm2, qword ptr [r9+16]
        movq    mm3, qword ptr [r9+24]
        movq    mm4, qword ptr [r9+32]
        movq    mm5, qword ptr [r9+40]
        movq    mm6, qword ptr [r9+48]
        movq    mm7, qword ptr [r9+56]
        psrad   mm0, 8
        psrad   mm1, 8
        psrad   mm2, 8
        psrad   mm3, 8
        psrad   mm4, 8
        psrad   mm5, 8
        psrad   mm6, 8
        psrad   mm7, 8
        packssdw mm0, mm1               ; signed saturation dword -> word,
        packssdw mm2, mm3               ; same result as the scalar clamp
        packssdw mm4, mm5
        packssdw mm6, mm7
        movntq  qword ptr [rdx+ 0], mm0 ; non-temporal stores, fenced at LEnd
        movntq  qword ptr [rdx+ 8], mm2
        movntq  qword ptr [rdx+16], mm4
        movntq  qword ptr [rdx+24], mm6
        add     r9, 64
        add     rdx, 32
        dec     ecx
        jnz     LAgain

; Scalar loop for whatever is left (or for everything, if the source
; pointer was not dword-aligned).
LTail:
        test    r8d, r8d
        jz      LEnd

LClamp2:
        mov     eax, dword ptr [r9]
        sar     eax, 8
        cmp     eax, 32767
        jg      LClampHigh2
        cmp     eax, -32768
        jge     LClampDone2
        mov     eax, -32768
        jmp     LClampDone2
LClampHigh2:
        mov     eax, 32767
LClampDone2:
        mov     word ptr [rdx], ax
        add     r9, 4
        add     rdx, 2
        dec     r8d
        jnz     LClamp2

LEnd:
        sfence                          ; order the movntq stores
        emms                            ; leave MMX state so x87 code can run

LExit:
        ret

S_WriteLinearBlastStereo16_SSE_x64 ENDP

END