2008-08-12 03:20:50 +00:00
|
|
|
; Build guard: abort assembly with an error unless the output format is win64.
%ifnidn __OUTPUT_FORMAT__,win64
|
|
|
|
%error tmap3.asm is for Win64 output. You should use tmap.s for other systems.
|
|
|
|
%endif
|
|
|
|
|
- Ported vlinetallasm4 to AMD64 assembly. Even with the increased number of
registers AMD64 provides, this routine still needs to be written as self-
modifying code for maximum performance. The additional registers do allow
for further optimization over the x86 version by allowing all four pixels
to be in flight at the same time. The end result is that AMD64 ASM is about
2.18 times faster than AMD64 C and about 1.06 times faster than x86 ASM.
(For further comparison, AMD64 C and x86 C are practically the same for
this function.) Should I port any more assembly to AMD64, mvlineasm4 is the
most likely candidate, but it's not used enough at this point to bother.
Also, this may or may not work with Linux at the moment, since it doesn't
have the eh_handler metadata. Win64 is easier, since I just need to
structure the function prologue and epilogue properly and use some
assembler directives/macros to automatically generate the metadata. And
that brings up another point: You need YASM to assemble the AMD64 code,
because NASM doesn't support the Win64 metadata directives.
- Added an SSE version of DoBlending. This is strictly C intrinsics.
VC++ still throws around unnecessary register moves. GCC seems to be
pretty close to optimal, requiring only about 2 cycles/color. They're
both faster than my hand-written MMX routine, so I don't need to feel
bad about not hand-optimizing this for x64 builds.
- Removed an extra instruction from DoBlending_MMX, transposed two
instructions, and unrolled it once, shaving off about 80 cycles from the
time required to blend 256 palette entries. Why? Because I tried writing
a C version of the routine using compiler intrinsics and was appalled by
all the extra movq's VC++ added to the code. GCC was better, but still
generated extra instructions. I only wanted a C version because I can't
use inline assembly with VC++'s x64 compiler, and x64 assembly is a bit
of a pain. (It's a pain because Linux and Windows have different calling
conventions, and you need to maintain extra metadata for functions.) So,
the assembly version stays and the C version stays out.
- Removed all the pixel doubling r_detail modes, since the one platform they
were intended to assist (486) actually sees very little benefit from them.
- Rewrote CheckMMX in C and renamed it to CheckCPU.
- Fixed: CPUID function 0x80000005 is specified to return detailed L1 cache
only for AMD processors, so we must not use it on other architectures, or
we end up overwriting the L1 cache line size with 0 or some other number
we don't actually understand.
SVN r1134 (trunk)
2008-08-09 03:13:43 +00:00
|
|
|
BITS 64
|
2008-08-12 03:20:50 +00:00
|
|
|
DEFAULT REL
|
|
|
|
|
- Ported vlinetallasm4 to AMD64 assembly. Even with the increased number of
registers AMD64 provides, this routine still needs to be written as self-
modifying code for maximum performance. The additional registers do allow
for further optimization over the x86 version by allowing all four pixels
to be in flight at the same time. The end result is that AMD64 ASM is about
2.18 times faster than AMD64 C and about 1.06 times faster than x86 ASM.
(For further comparison, AMD64 C and x86 C are practically the same for
this function.) Should I port any more assembly to AMD64, mvlineasm4 is the
most likely candidate, but it's not used enough at this point to bother.
Also, this may or may not work with Linux at the moment, since it doesn't
have the eh_handler metadata. Win64 is easier, since I just need to
structure the function prologue and epilogue properly and use some
assembler directives/macros to automatically generate the metadata. And
that brings up another point: You need YASM to assemble the AMD64 code,
because NASM doesn't support the Win64 metadata directives.
- Added an SSE version of DoBlending. This is strictly C intrinsics.
VC++ still throws around unnecessary register moves. GCC seems to be
pretty close to optimal, requiring only about 2 cycles/color. They're
both faster than my hand-written MMX routine, so I don't need to feel
bad about not hand-optimizing this for x64 builds.
- Removed an extra instruction from DoBlending_MMX, transposed two
instructions, and unrolled it once, shaving off about 80 cycles from the
time required to blend 256 palette entries. Why? Because I tried writing
a C version of the routine using compiler intrinsics and was appalled by
all the extra movq's VC++ added to the code. GCC was better, but still
generated extra instructions. I only wanted a C version because I can't
use inline assembly with VC++'s x64 compiler, and x64 assembly is a bit
of a pain. (It's a pain because Linux and Windows have different calling
conventions, and you need to maintain extra metadata for functions.) So,
the assembly version stays and the C version stays out.
- Removed all the pixel doubling r_detail modes, since the one platform they
were intended to assist (486) actually sees very little benefit from them.
- Rewrote CheckMMX in C and renamed it to CheckCPU.
- Fixed: CPUID function 0x80000005 is specified to return detailed L1 cache
only for AMD processors, so we must not use it on other architectures, or
we end up overwriting the L1 cache line size with 0 or some other number
we don't actually understand.
SVN r1134 (trunk)
2008-08-09 03:13:43 +00:00
|
|
|
; Globals defined outside this file. The sizes noted below are the widths
; and element counts that the code in this file actually accesses.
EXTERN vplce ; 4 dwords: per-column positions, loaded and written back by vlinetallasm4
|
|
|
|
EXTERN vince ; 4 dwords: per-column position steps, patched into step1-step4
|
|
|
|
EXTERN palookupoffse ; 4 qwords: per-column base addresses for the second byte lookup
|
|
|
|
EXTERN bufplce ; 4 qwords: per-column base addresses for the first byte lookup
|
|
|
|
|
|
|
|
EXTERN dc_count ; dword: number of rows to draw (signed; <= 0 draws nothing)
|
|
|
|
EXTERN dc_dest ; qword: address of the first row's four destination bytes
|
|
|
|
EXTERN dc_pitch ; dword: row-to-row byte step, patched into the code by ASM_PatchPitch
|
|
|
|
|
|
|
|
SECTION .text
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; ASM_PatchPitch
; Copies the 32-bit value of dc_pitch into the immediate operands of the
; two instructions labelled pm (imul rcx, imm32) and vltpitch
; (add rcx, imm32) inside vlinetallasm4, replacing the assembled
; placeholder value of 320.
; In:    none (reads the global dc_pitch)
; Out:   none
; Clobb: rcx (the 32-bit load zero-extends into the full register)
; Note:  the +3 offsets skip the REX.W prefix, opcode and ModRM bytes;
;        this assumes the assembler emitted the imm32 form of both
;        instructions (true for the placeholder 320, which does not fit
;        in a sign-extended imm8).
;-----------------------------------------------------------------------
GLOBAL ASM_PatchPitch
|
|
|
|
ASM_PatchPitch:
|
|
|
|
mov ecx, [dc_pitch] ; ecx = current pitch
|
|
|
|
mov [pm+3], ecx ; patch the multiplier used to compute count*pitch
|
|
|
|
mov [vltpitch+3], ecx ; patch the per-row increment of the loop offset
|
2008-08-12 03:20:50 +00:00
|
|
|
ret
|
2008-08-12 02:49:00 +00:00
|
|
|
align 16
|
- Ported vlinetallasm4 to AMD64 assembly. Even with the increased number of
registers AMD64 provides, this routine still needs to be written as self-
modifying code for maximum performance. The additional registers do allow
for further optimization over the x86 version by allowing all four pixels
to be in flight at the same time. The end result is that AMD64 ASM is about
2.18 times faster than AMD64 C and about 1.06 times faster than x86 ASM.
(For further comparison, AMD64 C and x86 C are practically the same for
this function.) Should I port any more assembly to AMD64, mvlineasm4 is the
most likely candidate, but it's not used enough at this point to bother.
Also, this may or may not work with Linux at the moment, since it doesn't
have the eh_handler metadata. Win64 is easier, since I just need to
structure the function prologue and epilogue properly and use some
assembler directives/macros to automatically generate the metadata. And
that brings up another point: You need YASM to assemble the AMD64 code,
because NASM doesn't support the Win64 metadata directives.
- Added an SSE version of DoBlending. This is strictly C intrinsics.
VC++ still throws around unnecessary register moves. GCC seems to be
pretty close to optimal, requiring only about 2 cycles/color. They're
both faster than my hand-written MMX routine, so I don't need to feel
bad about not hand-optimizing this for x64 builds.
- Removed an extra instruction from DoBlending_MMX, transposed two
instructions, and unrolled it once, shaving off about 80 cycles from the
time required to blend 256 palette entries. Why? Because I tried writing
a C version of the routine using compiler intrinsics and was appalled by
all the extra movq's VC++ added to the code. GCC was better, but still
generated extra instructions. I only wanted a C version because I can't
use inline assembly with VC++'s x64 compiler, and x64 assembly is a bit
of a pain. (It's a pain because Linux and Windows have different calling
conventions, and you need to maintain extra metadata for functions.) So,
the assembly version stays and the C version stays out.
- Removed all the pixel doubling r_detail modes, since the one platform they
were intended to assist (486) actually sees very little benefit from them.
- Rewrote CheckMMX in C and renamed it to CheckCPU.
- Fixed: CPUID function 0x80000005 is specified to return detailed L1 cache
only for AMD processors, so we must not use it on other architectures, or
we end up overwriting the L1 cache line size with 0 or some other number
we don't actually understand.
SVN r1134 (trunk)
2008-08-09 03:13:43 +00:00
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; setupvlinetallasm
; Stores the byte in cl into the imm8 shift count of the four
; "shr r32, imm8" instructions labelled shifter1-shifter4 inside
; vlinetallasm4, replacing the assembled placeholder value of 24.
; In:    cl = shift count (presumably the first integer argument under
;        the Win64 calling convention - confirm against the C prototype)
; Out:   none
; Clobb: none
; Note:  the +2 offsets skip the opcode and ModRM bytes of each shr; none
;        of the four destination registers (edx, ebx, ebp, esi) needs a
;        REX prefix, so the immediate is always the third byte.
;-----------------------------------------------------------------------
GLOBAL setupvlinetallasm
|
|
|
|
setupvlinetallasm:
|
2008-08-12 02:49:00 +00:00
|
|
|
mov [shifter1+2], cl ; column 1 shift count
|
|
|
|
mov [shifter2+2], cl ; column 2 shift count
|
|
|
|
mov [shifter3+2], cl ; column 3 shift count
|
|
|
|
mov [shifter4+2], cl ; column 4 shift count
|
2008-08-12 03:20:50 +00:00
|
|
|
ret
|
2008-08-12 02:49:00 +00:00
|
|
|
align 16
|
- Ported vlinetallasm4 to AMD64 assembly. Even with the increased number of
registers AMD64 provides, this routine still needs to be written as self-
modifying code for maximum performance. The additional registers do allow
for further optimization over the x86 version by allowing all four pixels
to be in flight at the same time. The end result is that AMD64 ASM is about
2.18 times faster than AMD64 C and about 1.06 times faster than x86 ASM.
(For further comparison, AMD64 C and x86 C are practically the same for
this function.) Should I port any more assembly to AMD64, mvlineasm4 is the
most likely candidate, but it's not used enough at this point to bother.
Also, this may or may not work with Linux at the moment, since it doesn't
have the eh_handler metadata. Win64 is easier, since I just need to
structure the function prologue and epilogue properly and use some
assembler directives/macros to automatically generate the metadata. And
that brings up another point: You need YASM to assemble the AMD64 code,
because NASM doesn't support the Win64 metadata directives.
- Added an SSE version of DoBlending. This is strictly C intrinsics.
VC++ still throws around unnecessary register moves. GCC seems to be
pretty close to optimal, requiring only about 2 cycles/color. They're
both faster than my hand-written MMX routine, so I don't need to feel
bad about not hand-optimizing this for x64 builds.
- Removed an extra instruction from DoBlending_MMX, transposed two
instructions, and unrolled it once, shaving off about 80 cycles from the
time required to blend 256 palette entries. Why? Because I tried writing
a C version of the routine using compiler intrinsics and was appalled by
all the extra movq's VC++ added to the code. GCC was better, but still
generated extra instructions. I only wanted a C version because I can't
use inline assembly with VC++'s x64 compiler, and x64 assembly is a bit
of a pain. (It's a pain because Linux and Windows have different calling
conventions, and you need to maintain extra metadata for functions.) So,
the assembly version stays and the C version stays out.
- Removed all the pixel doubling r_detail modes, since the one platform they
were intended to assist (486) actually sees very little benefit from them.
- Rewrote CheckMMX in C and renamed it to CheckCPU.
- Fixed: CPUID function 0x80000005 is specified to return detailed L1 cache
only for AMD processors, so we must not use it on other architectures, or
we end up overwriting the L1 cache line size with 0 or some other number
we don't actually understand.
SVN r1134 (trunk)
2008-08-09 03:13:43 +00:00
|
|
|
|
|
|
|
; The code that follows is patched at run time (see the stores to pm,
; vltpitch, shifterN, stepN and sourceN), so it is placed in its own
; section that is declared writable.
; Yasm can't do progbits alloc exec for win64?
|
|
|
|
; Hmm, looks like it's automatic. No worries, then.
|
2008-08-12 02:49:00 +00:00
|
|
|
SECTION .rtext write ;progbits alloc exec
|
- Ported vlinetallasm4 to AMD64 assembly. Even with the increased number of
registers AMD64 provides, this routine still needs to be written as self-
modifying code for maximum performance. The additional registers do allow
for further optimization over the x86 version by allowing all four pixels
to be in flight at the same time. The end result is that AMD64 ASM is about
2.18 times faster than AMD64 C and about 1.06 times faster than x86 ASM.
(For further comparison, AMD64 C and x86 C are practically the same for
this function.) Should I port any more assembly to AMD64, mvlineasm4 is the
most likely candidate, but it's not used enough at this point to bother.
Also, this may or may not work with Linux at the moment, since it doesn't
have the eh_handler metadata. Win64 is easier, since I just need to
structure the function prologue and epilogue properly and use some
assembler directives/macros to automatically generate the metadata. And
that brings up another point: You need YASM to assemble the AMD64 code,
because NASM doesn't support the Win64 metadata directives.
- Added an SSE version of DoBlending. This is strictly C intrinsics.
VC++ still throws around unnecessary register moves. GCC seems to be
pretty close to optimal, requiring only about 2 cycles/color. They're
both faster than my hand-written MMX routine, so I don't need to feel
bad about not hand-optimizing this for x64 builds.
- Removed an extra instruction from DoBlending_MMX, transposed two
instructions, and unrolled it once, shaving off about 80 cycles from the
time required to blend 256 palette entries. Why? Because I tried writing
a C version of the routine using compiler intrinsics and was appalled by
all the extra movq's VC++ added to the code. GCC was better, but still
generated extra instructions. I only wanted a C version because I can't
use inline assembly with VC++'s x64 compiler, and x64 assembly is a bit
of a pain. (It's a pain because Linux and Windows have different calling
conventions, and you need to maintain extra metadata for functions.) So,
the assembly version stays and the C version stays out.
- Removed all the pixel doubling r_detail modes, since the one platform they
were intended to assist (486) actually sees very little benefit from them.
- Rewrote CheckMMX in C and renamed it to CheckCPU.
- Fixed: CPUID function 0x80000005 is specified to return detailed L1 cache
only for AMD processors, so we must not use it on other architectures, or
we end up overwriting the L1 cache line size with 0 or some other number
we don't actually understand.
SVN r1134 (trunk)
2008-08-09 03:13:43 +00:00
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; vlinetallasm4
; Writes dc_count rows of four adjacent bytes, starting at dc_dest and
; stepping by the patched pitch per row. For each of the four columns i:
;   index = vplce[i] >> (patched shift count)
;   byte  = palookupoffse[i][ bufplce[i][index] ]
;   vplce[i] += vince[i]
; The updated vplce[0..3] are stored back on exit (not when dc_count <= 0).
;
; In:    no register arguments; reads dc_count, dc_dest, bufplce, vince,
;        vplce and palookupoffse
; Out:   no return value is set up
; Clobb: rax, rcx, rdx, r8-r11, flags (all other registers used are
;        saved and restored)
; Self-modifying: on every call this routine patches step1-step4 with
; vince[0..3] and source2-source4 with the offsets of bufplce[1..3]
; relative to bufplce[0]. The shift counts (shifter1-4) and the pitch
; immediates (pm, vltpitch) are patched by setupvlinetallasm and
; ASM_PatchPitch respectively.
;-----------------------------------------------------------------------
GLOBAL vlinetallasm4
|
|
|
|
PROC_FRAME vlinetallasm4
|
|
|
|
rex_push_reg rbx ; save the callee-saved registers this routine uses
|
|
|
|
push_reg rdi
|
|
|
|
push_reg r15
|
|
|
|
push_reg r14
|
|
|
|
push_reg r13
|
|
|
|
push_reg r12
|
|
|
|
push_reg rbp
|
|
|
|
push_reg rsi
|
|
|
|
alloc_stack 8 ; Stack must be 16-byte aligned
|
|
|
|
END_PROLOGUE
|
|
|
|
; rax = bufplce base address (bufplce[0])
|
|
|
|
; ebx/rbx, ebp/rbp, esi/rsi = scratch for columns 2, 3 and 4
|
|
|
|
; rcx = offset from rdi/count (negative)
|
|
|
|
; edx/rdx = scratch (column 1)
|
|
|
|
; rdi = bottom of columns to write to
|
|
|
|
; r8d-r11d = column offsets
|
|
|
|
; r12-r15 = palookupoffse[0] - palookupoffse[3]
|
|
|
|
|
|
|
|
|
mov ecx, [dc_count] ; ecx = row count (upper half of rcx zeroed)
|
|
|
|
mov rdi, [dc_dest] ; rdi = address of the first row
|
|
|
|
test ecx, ecx
|
|
|
|
jle vltepilog ; count must be positive
|
|
|
|
|
|
|
|
|
mov rax, [bufplce] ; rax = bufplce[0], used as the base for all four columns
|
|
|
|
mov r8, [bufplce+8]
|
|
|
|
sub r8, rax ; r8 = bufplce[1] - bufplce[0]
|
|
|
|
mov r9, [bufplce+16]
|
|
|
|
sub r9, rax ; r9 = bufplce[2] - bufplce[0]
|
|
|
|
mov r10, [bufplce+24]
|
|
|
|
sub r10, rax ; r10 = bufplce[3] - bufplce[0]
|
|
|
|
; Patch the disp32 field (+4 = after opcode, ModRM and SIB bytes) of the
; column 2-4 source loads. Only the low 32 bits of each difference are
; stored. NOTE(review): this assumes every buffer lies within the signed
; 32-bit displacement range of bufplce[0] - confirm against the allocator.
mov [source2+4], r8d
|
|
|
|
mov [source3+4], r9d
|
|
|
|
mov [source4+4], r10d
|
|
|
|
|
|
|
|
|
pm: imul rcx, 320 ; rcx = count * pitch (320 is a placeholder patched by ASM_PatchPitch)
|
|
|
|
|
|
|
|
|
mov r12, [palookupoffse] ; r12-r15 = second-lookup base address for columns 1-4
|
|
|
|
mov r13, [palookupoffse+8]
|
|
|
|
mov r14, [palookupoffse+16]
|
|
|
|
mov r15, [palookupoffse+24]
|
|
|
|
|
|
|
|
|
mov r8d, [vince] ; r8d-r11d temporarily hold the four per-row steps
|
|
|
|
mov r9d, [vince+4]
|
|
|
|
mov r10d, [vince+8]
|
|
|
|
mov r11d, [vince+12]
|
|
|
|
mov [step1+3], r8d ; patch the imm32 (+3 = after REX, opcode, ModRM) of each step add
|
|
|
|
mov [step2+3], r9d
|
|
|
|
mov [step3+3], r10d
|
|
|
|
mov [step4+3], r11d
|
|
|
|
|
|
|
|
|
add rdi, rcx ; rdi = dc_dest + count*pitch (one row past the last row written)
|
|
|
|
neg rcx ; rcx = -count*pitch; climbs toward zero by one pitch per row
|
|
|
|
|
|
|
|
|
mov r8d, [vplce] ; r8d-r11d = current position of columns 1-4
|
|
|
|
mov r9d, [vplce+4]
|
|
|
|
mov r10d, [vplce+8]
|
|
|
|
mov r11d, [vplce+12]
|
|
|
|
jmp loopit ; jump over the alignment padding
|
|
|
|
|
|
|
|
|
ALIGN 16
|
|
|
|
; One iteration per row. The four columns are interleaved, so the
; instruction order below is deliberate; do not regroup by column.
loopit:
|
|
|
|
mov edx, r8d ; column 1: copy position
|
|
|
|
shifter1: shr edx, 24 ; column 1: index = position >> shift (imm8 patched by setupvlinetallasm)
|
|
|
|
step1: add r8d, 0x88888888 ; column 1: position += vince[0] (imm32 patched above)
|
2008-08-12 02:49:00 +00:00
|
|
|
movzx edx, BYTE [rax+rdx] ; column 1: first lookup, from bufplce[0]
|
- Ported vlinetallasm4 to AMD64 assembly. Even with the increased number of
registers AMD64 provides, this routine still needs to be written as self-
modifying code for maximum performance. The additional registers do allow
for further optimization over the x86 version by allowing all four pixels
to be in flight at the same time. The end result is that AMD64 ASM is about
2.18 times faster than AMD64 C and about 1.06 times faster than x86 ASM.
(For further comparison, AMD64 C and x86 C are practically the same for
this function.) Should I port any more assembly to AMD64, mvlineasm4 is the
most likely candidate, but it's not used enough at this point to bother.
Also, this may or may not work with Linux at the moment, since it doesn't
have the eh_handler metadata. Win64 is easier, since I just need to
structure the function prologue and epilogue properly and use some
assembler directives/macros to automatically generate the metadata. And
that brings up another point: You need YASM to assemble the AMD64 code,
because NASM doesn't support the Win64 metadata directives.
- Added an SSE version of DoBlending. This is strictly C intrinsics.
VC++ still throws around unneccessary register moves. GCC seems to be
pretty close to optimal, requiring only about 2 cycles/color. They're
both faster than my hand-written MMX routine, so I don't need to feel
bad about not hand-optimizing this for x64 builds.
- Removed an extra instruction from DoBlending_MMX, transposed two
instructions, and unrolled it once, shaving off about 80 cycles from the
time required to blend 256 palette entries. Why? Because I tried writing
a C version of the routine using compiler intrinsics and was appalled by
all the extra movq's VC++ added to the code. GCC was better, but still
generated extra instructions. I only wanted a C version because I can't
use inline assembly with VC++'s x64 compiler, and x64 assembly is a bit
of a pain. (It's a pain because Linux and Windows have different calling
conventions, and you need to maintain extra metadata for functions.) So,
the assembly version stays and the C version stays out.
- Removed all the pixel doubling r_detail modes, since the one platform they
were intended to assist (486) actually sees very little benefit from them.
- Rewrote CheckMMX in C and renamed it to CheckCPU.
- Fixed: CPUID function 0x80000005 is specified to return detailed L1 cache
only for AMD processors, so we must not use it on other architectures, or
we end up overwriting the L1 cache line size with 0 or some other number
we don't actually understand.
SVN r1134 (trunk)
2008-08-09 03:13:43 +00:00
|
|
|
mov ebx, r9d ; column 2: copy position
|
|
|
|
mov dl, [r12+rdx] ; column 1: second lookup through palookupoffse[0]
|
|
|
|
shifter2: shr ebx, 24 ; column 2: index = position >> shift
|
|
|
|
step2: add r9d, 0x88888888 ; column 2: position += vince[1]
|
|
|
|
source2: movzx ebx, BYTE [rax+rbx+0x88888888] ; column 2: first lookup (disp32 patched to bufplce[1]-bufplce[0])
|
|
|
|
mov ebp, r10d ; column 3: copy position
|
|
|
|
mov bl, [r13+rbx] ; column 2: second lookup through palookupoffse[1]
|
|
|
|
shifter3: shr ebp, 24 ; column 3: index = position >> shift
|
|
|
|
step3: add r10d, 0x88888888 ; column 3: position += vince[2]
|
|
|
|
source3: movzx ebp, BYTE [rax+rbp+0x88888888] ; column 3: first lookup (disp32 patched to bufplce[2]-bufplce[0])
|
|
|
|
mov esi, r11d ; column 4: copy position
|
|
|
|
mov bpl, BYTE [r14+rbp] ; column 3: second lookup through palookupoffse[2]
|
|
|
|
shifter4: shr esi, 24 ; column 4: index = position >> shift
|
|
|
|
step4: add r11d, 0x88888888 ; column 4: position += vince[3]
|
|
|
|
source4: movzx esi, BYTE [rax+rsi+0x88888888] ; column 4: first lookup (disp32 patched to bufplce[3]-bufplce[0])
|
|
|
|
mov [rdi+rcx], dl ; store column 1 byte for this row
|
|
|
|
mov [rdi+rcx+1], bl ; store column 2 byte
|
|
|
|
mov sil, BYTE [r15+rsi] ; column 4: second lookup through palookupoffse[3]
|
|
|
|
mov [rdi+rcx+2], bpl ; store column 3 byte
|
|
|
|
mov [rdi+rcx+3], sil ; store column 4 byte
|
|
|
|
|
|
|
|
|
vltpitch: add rcx, 320 ; advance one row (320 is a placeholder patched by ASM_PatchPitch)
|
|
|
|
jl loopit ; keep looping while the offset is still negative
|
|
|
|
|
|
|
|
|
mov [vplce], r8d ; write the advanced column positions back
|
|
|
|
mov [vplce+4], r9d
|
|
|
|
mov [vplce+8], r10d
|
|
|
|
mov [vplce+12], r11d
|
|
|
|
|
|
|
|
|
vltepilog:
|
|
|
|
add rsp, 8 ; undo alloc_stack 8
|
|
|
|
pop rsi ; restore saved registers in reverse push order
|
|
|
|
pop rbp
|
|
|
|
pop r12
|
|
|
|
pop r13
|
|
|
|
pop r14
|
|
|
|
pop r15
|
|
|
|
pop rdi
|
|
|
|
pop rbx
|
2008-08-12 03:20:50 +00:00
|
|
|
ret
|
2008-08-12 02:49:00 +00:00
|
|
|
vlinetallasm4_end:
|
2008-08-12 03:20:50 +00:00
|
|
|
ENDPROC_FRAME
|
2008-08-12 02:49:00 +00:00
|
|
|
ALIGN 16
|
2008-08-12 03:20:50 +00:00
|
|
|
|