mirror of
https://github.com/UberGames/lilium-voyager.git
synced 2024-12-13 21:51:09 +00:00
Implement Mathias Benthrup's suggestion for x86 ASM snapvector implementation which reduces cache misses.
This commit is contained in:
parent
98af5f4bb0
commit
c927fab58f
2 changed files with 22 additions and 18 deletions
|
@ -48,14 +48,15 @@ IFDEF idx64
|
||||||
stmxcsr [rsp] ; save SSE control word
|
stmxcsr [rsp] ; save SSE control word
|
||||||
ldmxcsr ssecw ; set to round nearest
|
ldmxcsr ssecw ; set to round nearest
|
||||||
|
|
||||||
push rdi
|
movaps xmm1, ssemask ; initialize the mask register
|
||||||
mov rdi, rcx ; maskmovdqu uses rdi as implicit memory operand
|
movups xmm0, [rcx] ; here is stored our vector. Read 4 values in one go
|
||||||
movaps xmm1, ssemask ; initialize the mask register for maskmovdqu
|
movaps xmm2, xmm0 ; keep a copy of the original data
|
||||||
movups xmm0, [rdi] ; here is stored our vector. Read 4 values in one go
|
andps xmm0, xmm1 ; set the fourth value to zero in xmm0
|
||||||
|
andnps xmm1, xmm2 ; copy fourth value to xmm1 and set rest to zero
|
||||||
cvtps2dq xmm0, xmm0 ; convert 4 single fp to int
|
cvtps2dq xmm0, xmm0 ; convert 4 single fp to int
|
||||||
cvtdq2ps xmm0, xmm0 ; convert 4 int to single fp
|
cvtdq2ps xmm0, xmm0 ; convert 4 int to single fp
|
||||||
maskmovdqu xmm0, xmm1 ; write 3 values back to memory
|
orps xmm0, xmm1 ; combine all 4 values again
|
||||||
pop rdi
|
movups [rcx], xmm0 ; write 3 rounded and 1 unchanged values back to memory
|
||||||
|
|
||||||
ldmxcsr [rsp] ; restore sse control word to old value
|
ldmxcsr [rsp] ; restore sse control word to old value
|
||||||
add rsp, 8
|
add rsp, 8
|
||||||
|
@ -69,14 +70,16 @@ ELSE
|
||||||
stmxcsr [esp] ; save SSE control word
|
stmxcsr [esp] ; save SSE control word
|
||||||
ldmxcsr ssecw ; set to round nearest
|
ldmxcsr ssecw ; set to round nearest
|
||||||
|
|
||||||
push edi
|
mov eax, dword ptr 16[esp] ; store address of vector in eax
|
||||||
mov edi, dword ptr 16[esp] ; maskmovdqu uses edi as implicit memory operand
|
movaps xmm1, ssemask ; initialize the mask register
|
||||||
movaps xmm1, ssemask ; initialize the mask register for maskmovdqu
|
movups xmm0, [eax] ; here is stored our vector. Read 4 values in one go
|
||||||
movups xmm0, [edi] ; here is stored our vector. Read 4 values in one go
|
movaps xmm2, xmm0 ; keep a copy of the original data
|
||||||
|
andps xmm0, xmm1 ; set the fourth value to zero in xmm0
|
||||||
|
andnps xmm1, xmm2 ; copy fourth value to xmm1 and set rest to zero
|
||||||
cvtps2dq xmm0, xmm0 ; convert 4 single fp to int
|
cvtps2dq xmm0, xmm0 ; convert 4 single fp to int
|
||||||
cvtdq2ps xmm0, xmm0 ; convert 4 int to single fp
|
cvtdq2ps xmm0, xmm0 ; convert 4 int to single fp
|
||||||
maskmovdqu xmm0, xmm1 ; write 3 values back to memory
|
orps xmm0, xmm1 ; combine all 4 values again
|
||||||
pop edi
|
movups [eax], xmm0 ; write 3 rounded and 1 unchanged values back to memory
|
||||||
|
|
||||||
ldmxcsr [esp] ; restore sse control word to old value
|
ldmxcsr [esp] ; restore sse control word to old value
|
||||||
add esp, 8
|
add esp, 8
|
||||||
|
|
|
@ -47,17 +47,18 @@ void qsnapvectorsse(vec3_t vec)
|
||||||
|
|
||||||
"movaps (%0), %%xmm1\n"
|
"movaps (%0), %%xmm1\n"
|
||||||
"movups (%2), %%xmm0\n"
|
"movups (%2), %%xmm0\n"
|
||||||
|
"movaps %%xmm0, %%xmm2\n"
|
||||||
|
"andps %%xmm1, %%xmm0\n"
|
||||||
|
"andnps %%xmm2, %%xmm1\n"
|
||||||
"cvtps2dq %%xmm0, %%xmm0\n"
|
"cvtps2dq %%xmm0, %%xmm0\n"
|
||||||
"cvtdq2ps %%xmm0, %%xmm0\n"
|
"cvtdq2ps %%xmm0, %%xmm0\n"
|
||||||
// vec MUST reside in register rdi as maskmovdqu uses
|
"orps %%xmm1, %%xmm0\n"
|
||||||
// it as an implicit operand. The "D" constraint makes
|
"movups %%xmm0, (%2)\n"
|
||||||
// sure of that.
|
|
||||||
"maskmovdqu %%xmm1, %%xmm0\n"
|
|
||||||
|
|
||||||
"ldmxcsr %3\n"
|
"ldmxcsr %3\n"
|
||||||
:
|
:
|
||||||
: "r" (ssemask), "m" (ssecw), "D" (vec), "m" (oldcw)
|
: "r" (ssemask), "m" (ssecw), "r" (vec), "m" (oldcw)
|
||||||
: "memory", "%xmm0", "%xmm1"
|
: "memory", "%xmm0", "%xmm1", "%xmm2"
|
||||||
);
|
);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue