; rallyunlimited-engine/code/asm/snd_mix_x64.asm
; MASM (ml64.exe) syntax — Microsoft x64 ABI, Windows only
; this is mostly copy-pasted x86-version with x86_64-related fixes and comments
; runs about 4x times faster than C version
; saves about 3M ticks each second
.code

;-----------------------------------------------------------------------
; void S_WriteLinearBlastStereo16_SSE_x64(const int *snd_p,
;                                         short     *snd_out,
;                                         int        snd_linear_count)
;
; Converts mixed 24.8 fixed-point samples to clamped signed 16-bit PCM
; using MMX + the SSE extensions movntq/sfence (hence the "_SSE" name).
;
; ABI:    Microsoft x64 (leaf routine, no calls)
; In:     rcx = snd_p             source buffer of 32-bit samples
;         rdx = snd_out           destination buffer of 16-bit samples
;         r8d = snd_linear_count  sample count
; Out:    none
; Saves:  rbx, rsi, rdi (callee-saved); mm0-mm7 used, emms on exit
; NOTE:   no unwind info (PROC FRAME / .xdata) is emitted, so an
;         exception raised while inside this routine will not unwind
;         correctly — acceptable for a leaf mixing loop, but be aware.
;-----------------------------------------------------------------------
S_WriteLinearBlastStereo16_SSE_x64 PROC
    push rsi
    push rdi
    push rbx
    mov rsi, rcx                ; rsi = snd_p
    mov rdi, rdx                ; rdi = snd_out
    mov rbx, r8                 ; ebx = samples remaining
    test ebx,ebx
    jz LExit                    ; zero samples - nothing to do

    ; -- align source pointer to a 64-byte boundary with scalar steps --
    mov ecx,esi                 ; low bits of rsi suffice for alignment
    and ecx,63                  ; ecx = misalignment in bytes
    jz LMain                    ; already aligned - go vectorize
    test ecx,3                  ; flags only: must NOT clobber ecx here
                                ; (the old 'and ecx,3' destroyed the byte
                                ; offset, so the pre-loop always ran 16
                                ; iterations and never actually aligned rsi)
    jnz LTail                   ; not even dword-aligned: scalar-only path,
                                ; should never happen for sample buffers
    shr ecx,2                   ; ecx = dwords past the 64-byte boundary
    not ecx
    add ecx,17                  ; ecx = 16 - dwords_past = dwords to align
LClamp1:
    mov eax,[rsi]
    sar eax,8                   ; 24.8 fixed point -> integer sample
    cmp eax,32767
    jg LClampHigh1
    cmp eax,-32768
    jnl LClampDone1             ; signed compare: in range, store as-is
    mov eax,-32768              ; clamp to SHRT_MIN
    jmp LClampDone1
LClampHigh1:
    mov eax,32767               ; clamp to SHRT_MAX
LClampDone1:
    mov [rdi],ax
    add rsi,4
    add rdi,2
    dec ebx
    jz LExit                    ; buffer exhausted while aligning; no MMX
                                ; or non-temporal store has run yet, so
                                ; skipping sfence/emms here is safe
    dec ecx
    jnz LClamp1

    ; -- main loop: 16 samples (64 source bytes) per iteration ---------
LMain:
    mov ecx,ebx
    shr ecx,4                   ; ecx = number of 64-byte blocks
    jz LTail                    ; fewer than 16 samples left
    and ebx, 15                 ; ebx = remainder for the tail loop
LAgain:
    movq mm0, qword ptr [rsi+ 0]
    movq mm1, qword ptr [rsi+ 8]
    movq mm2, qword ptr [rsi+16]
    movq mm3, qword ptr [rsi+24]
    movq mm4, qword ptr [rsi+32]
    movq mm5, qword ptr [rsi+40]
    movq mm6, qword ptr [rsi+48]
    movq mm7, qword ptr [rsi+56]
    psrad mm0,8                 ; drop the 8 fractional bits
    psrad mm1,8
    psrad mm2,8
    psrad mm3,8
    psrad mm4,8
    psrad mm5,8
    psrad mm6,8
    psrad mm7,8
    packssdw mm0, mm1           ; saturating pack: dword -> signed word,
    packssdw mm2, mm3           ; performs the +/-32767 clamp for free
    packssdw mm4, mm5
    packssdw mm6, mm7
    movntq qword ptr [rdi+ 0], mm0  ; non-temporal: bypass cache, the
    movntq qword ptr [rdi+ 8], mm2  ; output buffer is write-only here
    movntq qword ptr [rdi+16], mm4
    movntq qword ptr [rdi+24], mm6
    add rsi, 64
    add rdi, 32
    dec ecx
    jnz LAgain

    ; -- scalar tail: remaining 0..15 samples --------------------------
LTail:
    test ebx, ebx
    jz LEnd
LClamp2:
    mov eax,[rsi]
    sar eax,8                   ; 24.8 fixed point -> integer sample
    cmp eax,32767
    jg LClampHigh2
    cmp eax,-32768
    jnl LClampDone2
    mov eax,-32768              ; clamp to SHRT_MIN
    jmp LClampDone2
LClampHigh2:
    mov eax,32767               ; clamp to SHRT_MAX
LClampDone2:
    mov [rdi],ax
    add rsi,4
    add rdi,2
    dec ebx
    jnz LClamp2
LEnd:
    sfence                      ; order the movntq stores before return
    emms                        ; leave MMX state, restore x87 tag word
LExit:
    pop rbx
    pop rdi
    pop rsi
    ret
S_WriteLinearBlastStereo16_SSE_x64 ENDP
END