Port the x86-64 vlinetallasm4 column drawer to GNU as for Unix builds.

* src/CMakeLists.txt: on Unix x86-64 look for "as" instead of YASM/NASM and
  assemble <name>.s; every other configuration keeps assembling <name>.asm.
  The custom command now DEPENDS on the file it actually assembles, so an
  edit to tmap3.s triggers a rebuild.
* src/asm_x86_64/tmap3.asm: now Win64-only; any other output format stops
  with an %error that points at tmap3.s.
* src/asm_x86_64/tmap3.s: new AT&T-syntax version of the same routines.

Note: the "index" blob ids below are carried over from the original patch and
do not reflect the corrected lines.

diff --git a/docs/rh-log.txt b/docs/rh-log.txt
index 0a6ac35dba..28ae704cac 100644
--- a/docs/rh-log.txt
+++ b/docs/rh-log.txt
@@ -1,3 +1,11 @@
+August 11, 2008
+- Ported asm_x86_64/tmap3.asm to AT&T syntax so it can be compiled with gas.
+  After finding out that gas does have directives to describe the .eh_frame
+  metadata, I figured that would be significantly easier and quicker than
+  trying to locate all the scattered docs needed to construct it by hand.
+  Unfortunately, this now means I have to maintain two versions of exactly
+  the same code. :(
+
 August 11, 2008 (Changes by Graf Zahl)
 - Removed 'eval' modifier from DECORATE. All int, float and bool parameters
   are 'eval' now by default.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 804f91f9e8..52aabec8bd 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -168,24 +168,35 @@ endif( FMOD_LIBRARY )
 # Search for NASM
 
 if( NOT NO_ASM )
-	find_program( NASM_PATH NAMES ${NASM_NAMES} )
-	find_program( YASM_PATH yasm )
-
-	if( YASM_PATH )
-		set( ASSEMBLER ${YASM_PATH} )
-	else( YASM_PATH )
-		if( X64 )
-			message( STATUS "Could not find YASM. Disabling assembly code." )
+	if( UNIX AND X64 )
+		find_program( GAS_PATH as )
+
+		if( GAS_PATH )
+			set( ASSEMBLER ${GAS_PATH} )
+		else( GAS_PATH )
+			message( STATUS "Could not find as. Disabling assembly code." )
 			set( NO_ASM ON )
-		else( X64 )
-			if( NOT NASM_PATH )
-				message( STATUS "Could not find YASM or NASM. Disabling assembly code." )
+		endif( GAS_PATH )
+	else( UNIX AND X64 )
+		find_program( NASM_PATH NAMES ${NASM_NAMES} )
+		find_program( YASM_PATH yasm )
+
+		if( YASM_PATH )
+			set( ASSEMBLER ${YASM_PATH} )
+		else( YASM_PATH )
+			if( X64 )
+				message( STATUS "Could not find YASM. Disabling assembly code." )
 				set( NO_ASM ON )
-			else( NOT NASM_PATH )
-				set( ASSEMBLER ${NASM_PATH} )
-			endif( NOT NASM_PATH )
-		endif( X64 )
-	endif( YASM_PATH )
+			else( X64 )
+				if( NOT NASM_PATH )
+					message( STATUS "Could not find YASM or NASM. Disabling assembly code." )
+					set( NO_ASM ON )
+				else( NOT NASM_PATH )
+					set( ASSEMBLER ${NASM_PATH} )
+				endif( NOT NASM_PATH )
+			endif( X64 )
+		endif( YASM_PATH )
+	endif( UNIX AND X64 )
 
 	# I think the only reason there was a version requirement was because the
 	# executable name for Windows changed from 0.x to 2.0, right? This is
@@ -211,16 +222,19 @@ if( NOT NO_ASM )
 	if( UNIX )
 		set( ASM_OUTPUT_EXTENSION .o )
 		if( X64 )
-			set( ASM_FLAGS -f elf64 -DM_TARGET_LINUX )
+			set( ASM_FLAGS )
+			set( ASM_SOURCE_EXTENSION .s )
 		else( X64 )
-			set( ASM_FLAGS -f elf -DM_TARGET_LINUX )
+			set( ASM_FLAGS -f elf -DM_TARGET_LINUX -i${CMAKE_CURRENT_SOURCE_DIR}/ )
+			set( ASM_SOURCE_EXTENSION .asm )
 		endif( X64 )
 	else( UNIX )
 		set( ASM_OUTPUT_EXTENSION .obj )
+		set( ASM_SOURCE_EXTENSION .asm )
 		if( X64 )
 			set( ASM_FLAGS -f win64 -DWIN32 -DWIN64 )
 		else( X64 )
-			set( ASM_FLAGS -f win32 -DWIN32 )
+			set( ASM_FLAGS -f win32 -DWIN32 -i${CMAKE_CURRENT_SOURCE_DIR}/ )
 		endif( X64 )
 	endif( UNIX )
 	if( WIN32 )
@@ -234,7 +248,7 @@ if( NOT NO_ASM )
 	endif( WIN32 )
 	add_custom_command( OUTPUT ${ASM_OUTPUT_${infile}}
 		COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/zdoom.dir/${indir}
-		COMMAND ${ASSEMBLER} ${ASM_FLAGS} -i${CMAKE_CURRENT_SOURCE_DIR}/ -o"${ASM_OUTPUT_${infile}}" "${CMAKE_CURRENT_SOURCE_DIR}/${indir}/${infile}.asm"
+		COMMAND ${ASSEMBLER} ${ASM_FLAGS} -o"${ASM_OUTPUT_${infile}}" "${CMAKE_CURRENT_SOURCE_DIR}/${indir}/${infile}${ASM_SOURCE_EXTENSION}"
 		${FIXRTEXT_${infile}}
-		DEPENDS ${indir}/${infile}.asm ${FIXRTEXT} )
+		DEPENDS ${indir}/${infile}${ASM_SOURCE_EXTENSION} ${FIXRTEXT} )
 	set( ASM_SOURCES ${ASM_SOURCES} "${ASM_OUTPUT_${infile}}" )
diff --git a/src/asm_x86_64/tmap3.asm b/src/asm_x86_64/tmap3.asm
index f1b8a31bea..120bf5ae9f 100644
--- a/src/asm_x86_64/tmap3.asm
+++ b/src/asm_x86_64/tmap3.asm
@@ -1,36 +1,9 @@
-%include "valgrind.inc"
-
+%ifnidn __OUTPUT_FORMAT__,win64
+%error tmap3.asm is for Win64 output. You should use tmap3.s for other systems.
+%endif
+
 BITS 64
-DEFAULT REL
-
-%ifnidn __OUTPUT_FORMAT__,win64
-
-%macro PROC_FRAME 1
-%1:
-%endmacro
-
-%macro rex_push_reg 1
-	push %1
-%endmacro
-
-%macro push_reg 1
-	push %1
-%endmacro
-
-%macro alloc_stack 1
-	sub rsp,%1
-%endmacro
-
-%define parm1lo dil
-
-%else
-
-%define parm1lo cl
-
-%endif
-
-SECTION .data
-
+DEFAULT REL
 EXTERN vplce
 EXTERN vince
 EXTERN palookupoffse
@@ -42,34 +15,28 @@ EXTERN dc_pitch
 
 SECTION .text
 
-ALIGN 16
 GLOBAL ASM_PatchPitch
 ASM_PatchPitch:
 	mov ecx, [dc_pitch]
 	mov [pm+3], ecx
 	mov [vltpitch+3], ecx
 	selfmod pm, vltpitch+6
-	ret
+	ret
+	align 16
 
-ALIGN 16
 GLOBAL setupvlinetallasm
 setupvlinetallasm:
-	mov [shifter1+2], parm1lo
-	mov [shifter2+2], parm1lo
-	mov [shifter3+2], parm1lo
-	mov [shifter4+2], parm1lo
+	mov [shifter1+2], cl
+	mov [shifter2+2], cl
+	mov [shifter3+2], cl
+	mov [shifter4+2], cl
 	selfmod shifter1, shifter4+3
-	ret
+	ret
+	align 16
 
-%ifidn __OUTPUT_FORMAT__,win64
 ; Yasm can't do progbits alloc exec for win64?
 ; Hmm, looks like it's automatic. No worries, then.
-	SECTION .rtext write ;progbits alloc exec
-%else
-	SECTION .rtext progbits alloc exec write
-%endif
-
-ALIGN 16
+SECTION .rtext write ;progbits alloc exec
 
 GLOBAL vlinetallasm4
 PROC_FRAME vlinetallasm4
@@ -138,7 +105,7 @@ loopit:
 	mov edx, r8d
 shifter1:	shr edx, 24
 step1:	add r8d, 0x88888888
-	movzx rdx, BYTE [rax+rdx]
+	movzx edx, BYTE [rax+rdx]
 	mov ebx, r9d
 	mov dl, [r12+rdx]
 shifter2:	shr ebx, 24
@@ -178,5 +145,8 @@ vltepilog:
 	pop r15
 	pop rdi
 	pop rbx
-	ret
-ENDPROC_FRAME
+	ret
+vlinetallasm4_end:
+ENDPROC_FRAME
+	ALIGN 16
+
diff --git a/src/asm_x86_64/tmap3.s b/src/asm_x86_64/tmap3.s
new file mode 100644
index 0000000000..8a9b52e48f
--- /dev/null
+++ b/src/asm_x86_64/tmap3.s
@@ -0,0 +1,141 @@
+#%include "valgrind.inc"
+
+	.section .text
+
+.globl ASM_PatchPitch
+ASM_PatchPitch:
+	movl dc_pitch(%rip), %ecx
+	movl %ecx, pm+3(%rip)
+	movl %ecx, vltpitch+3(%rip)
+#	selfmod pm, vltpitch+6
+	ret
+	.align 16
+
+.globl setupvlinetallasm
+setupvlinetallasm:
+	movb %dil, shifter1+2(%rip)
+	movb %dil, shifter2+2(%rip)
+	movb %dil, shifter3+2(%rip)
+	movb %dil, shifter4+2(%rip)
+#	selfmod shifter1, shifter4+3
+	ret
+	.align 16
+
+	.section .rtext,"awx"
+
+.globl vlinetallasm4
+	.type vlinetallasm4,@function
+vlinetallasm4:
+	.cfi_startproc
+	push %rbx
+	push %rdi
+	push %r15
+	push %r14
+	push %r13
+	push %r12
+	push %rbp
+	push %rsi
+	subq $8, %rsp	# Does the stack need to be 16-byte aligned for Linux?
+	.cfi_adjust_cfa_offset 8
+
+# rax = bufplce base address
+# rbx =
+# rcx = offset from rdi/count (negative)
+# edx/rdx = scratch
+# rdi = bottom of columns to write to
+# r8d-r11d = column offsets
+# r12-r15 = palookupoffse[0] - palookupoffse[4]
+
+	movl dc_count(%rip), %ecx
+	movq dc_dest(%rip), %rdi
+	testl %ecx, %ecx
+	jle vltepilog	# count must be positive
+
+	movq bufplce(%rip), %rax
+	movq bufplce+8(%rip), %r8
+	subq %rax, %r8
+	movq bufplce+16(%rip), %r9
+	subq %rax, %r9
+	movq bufplce+24(%rip), %r10
+	subq %rax, %r10
+	movl %r8d, source2+4(%rip)
+	movl %r9d, source3+4(%rip)
+	movl %r10d, source4+4(%rip)
+
+pm:	imulq $320, %rcx
+
+	movq palookupoffse(%rip), %r12
+	movq palookupoffse+8(%rip), %r13
+	movq palookupoffse+16(%rip), %r14
+	movq palookupoffse+24(%rip), %r15
+
+	movl vince(%rip), %r8d
+	movl vince+4(%rip), %r9d
+	movl vince+8(%rip), %r10d
+	movl vince+12(%rip), %r11d
+	movl %r8d, step1+3(%rip)
+	movl %r9d, step2+3(%rip)
+	movl %r10d, step3+3(%rip)
+	movl %r11d, step4+3(%rip)
+
+	addq %rcx, %rdi
+	negq %rcx
+
+	movl vplce(%rip), %r8d
+	movl vplce+4(%rip), %r9d
+	movl vplce+8(%rip), %r10d
+	movl vplce+12(%rip), %r11d
+#	selfmod loopit, vltepilog
+	jmp loopit
+
+	.align 16
+loopit:
+	movl %r8d, %edx
+shifter1:	shrl $24, %edx
+step1:	addl $0x88888888, %r8d
+	movzbl (%rax,%rdx), %edx
+	movl %r9d, %ebx
+	movb (%r12,%rdx), %dl
+shifter2:	shrl $24, %ebx
+step2:	addl $0x88888888, %r9d
+source2:	movzbl 0x88888888(%rax,%rbx), %ebx
+	movl %r10d, %ebp
+	movb (%r13,%rbx), %bl
+shifter3:	shr $24, %ebp
+step3:	addl $0x88888888, %r10d
+source3:	movzbl 0x88888888(%rax,%rbp), %ebp
+	movl %r11d, %esi
+	movb (%r14,%rbp), %bpl
+shifter4:	shr $24, %esi
+step4:	add $0x88888888, %r11d
+source4:	movzbl 0x88888888(%rax,%rsi), %esi
+	movb %dl, (%rdi,%rcx)
+	movb %bl, 1(%rdi,%rcx)
+	movb (%r15,%rsi), %sil
+	movb %bpl, 2(%rdi,%rcx)
+	movb %sil, 3(%rdi,%rcx)
+
+vltpitch:	addq $320, %rcx
+	jl loopit
+
+	movl %r8d, vplce(%rip)
+	movl %r9d, vplce+4(%rip)
+	movl %r10d, vplce+8(%rip)
+	movl %r11d, vplce+12(%rip)
+
+vltepilog:
+	addq $8, %rsp
+	.cfi_adjust_cfa_offset -8
+	pop %rsi
+	pop %rbp
+	pop %r12
+	pop %r13
+	pop %r14
+	pop %r15
+	pop %rdi
+	pop %rbx
+	ret
+	.cfi_endproc
+	.align 16
+
+