From c927fab58fef33f537e97a471e10a02731df5537 Mon Sep 17 00:00:00 2001
From: Thilo Schulz
Date: Mon, 19 Sep 2011 18:30:24 +0000
Subject: [PATCH] Implement Mathias Benthrup's suggestion for x86 ASM snapvector implementation which reduces cache misses.

---
 code/asm/snapvector.asm | 27 +++++++++++++++------------
 code/asm/snapvector.c   | 13 +++++++------
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/code/asm/snapvector.asm b/code/asm/snapvector.asm
index acfb7930..a953778a 100644
--- a/code/asm/snapvector.asm
+++ b/code/asm/snapvector.asm
@@ -48,14 +48,15 @@ IFDEF idx64
 	stmxcsr [rsp]	; save SSE control word
 	ldmxcsr ssecw	; set to round nearest

-	push rdi
-	mov rdi, rcx	; maskmovdqu uses rdi as implicit memory operand
-	movaps xmm1, ssemask	; initialize the mask register for maskmovdqu
-	movups xmm0, [rdi]	; here is stored our vector. Read 4 values in one go
+	movaps xmm1, ssemask	; initialize the mask register
+	movups xmm0, [rcx]	; here is stored our vector. Read 4 values in one go
+	movaps xmm2, xmm0	; keep a copy of the original data
+	andps xmm0, xmm1	; set the fourth value to zero in xmm0
+	andnps xmm1, xmm2	; copy fourth value to xmm1 and set rest to zero
 	cvtps2dq xmm0, xmm0	; convert 4 single fp to int
 	cvtdq2ps xmm0, xmm0	; convert 4 int to single fp
-	maskmovdqu xmm0, xmm1	; write 3 values back to memory
-	pop rdi
+	orps xmm0, xmm1	; combine all 4 values again
+	movups [rcx], xmm0	; write 3 rounded and 1 unchanged values back to memory

 	ldmxcsr [rsp]	; restore sse control word to old value
 	add rsp, 8
@@ -69,14 +70,16 @@ ELSE
 	stmxcsr [esp]	; save SSE control word
 	ldmxcsr ssecw	; set to round nearest

-	push edi
-	mov edi, dword ptr 16[esp]	; maskmovdqu uses edi as implicit memory operand
-	movaps xmm1, ssemask	; initialize the mask register for maskmovdqu
-	movups xmm0, [edi]	; here is stored our vector. Read 4 values in one go
+	mov eax, dword ptr 12[esp]	; store address of vector in eax (edi is no longer pushed: 8 bytes MXCSR scratch + 4 bytes return address)
+	movaps xmm1, ssemask	; initialize the mask register
+	movups xmm0, [eax]	; here is stored our vector. Read 4 values in one go
+	movaps xmm2, xmm0	; keep a copy of the original data
+	andps xmm0, xmm1	; set the fourth value to zero in xmm0
+	andnps xmm1, xmm2	; copy fourth value to xmm1 and set rest to zero
 	cvtps2dq xmm0, xmm0	; convert 4 single fp to int
 	cvtdq2ps xmm0, xmm0	; convert 4 int to single fp
-	maskmovdqu xmm0, xmm1	; write 3 values back to memory
-	pop edi
+	orps xmm0, xmm1	; combine all 4 values again
+	movups [eax], xmm0	; write 3 rounded and 1 unchanged values back to memory

 	ldmxcsr [esp]	; restore sse control word to old value
 	add esp, 8
diff --git a/code/asm/snapvector.c b/code/asm/snapvector.c
index 8e9b2868..121c110e 100644
--- a/code/asm/snapvector.c
+++ b/code/asm/snapvector.c
@@ -47,17 +47,18 @@ void qsnapvectorsse(vec3_t vec)

 		"movaps (%0), %%xmm1\n"
 		"movups (%2), %%xmm0\n"
+		"movaps %%xmm0, %%xmm2\n"
+		"andps %%xmm1, %%xmm0\n"
+		"andnps %%xmm2, %%xmm1\n"
 		"cvtps2dq %%xmm0, %%xmm0\n"
 		"cvtdq2ps %%xmm0, %%xmm0\n"
-		// vec MUST reside in register rdi as maskmovdqu uses
-		// it as an implicit operand. The "D" constraint makes
-		// sure of that.
-		"maskmovdqu %%xmm1, %%xmm0\n"
+		"orps %%xmm1, %%xmm0\n"
+		"movups %%xmm0, (%2)\n"

 		"ldmxcsr %3\n"
 		:
-		: "r" (ssemask), "m" (ssecw), "D" (vec), "m" (oldcw)
-		: "memory", "%xmm0", "%xmm1"
+		: "r" (ssemask), "m" (ssecw), "r" (vec), "m" (oldcw)
+		: "memory", "%xmm0", "%xmm1", "%xmm2"
 	);

 }