raze/polymer/eduke32/build/src/a64.yasm
helixhorned eac9dd04bf Add WIP x86_64 assembly texture mapping routines.
- Currently: only tvlineasm1 and tvlineasm2, but incomplete (no reverse
  translucency, nonpow2 textures will crash)
- For System V AMD64 calling conventions; requires YASM

git-svn-id: https://svn.eduke32.com/eduke32@4066 1a8010ca-5511-0410-912e-c29ae57300e0
2013-09-21 13:37:31 +00:00

220 lines
4.6 KiB
Text

;;; x86_64 assembly for basic texture mapping functions, based on a.nasm.
;;; For use with System V AMD64 calling convention (rdi rsi rdx rcx r8 r9)
;;; See the included license file "BUILDLIC.TXT" for license info.
SECTION .text
%ifdef UNDERSCORES
%define asm1 _asm1
%define asm2 _asm2
%define a64_bpl _a64_bpl
%define a64_transmode _a64_transmode
%define a64_glogy _a64_glogy
%define a64_gtrans _a64_gtrans
%define a64_paloffs _a64_paloffs
%define tvlineasm1 _tvlineasm1
%define tvlineasm2 _tvlineasm2
%endif
;;; Imports
EXTERN asm1 ; intptr_t
EXTERN asm2 ; intptr_t
EXTERN a64_bpl ; int32_t
EXTERN a64_transmode ; int32_t
EXTERN a64_glogy ; int32_t
EXTERN a64_gtrans ; char *
EXTERN a64_paloffs ; intptr_t
;;; Exports
GLOBAL tvlineasm1 ; Masked & translucent 1-pixel wide vline
GLOBAL tvlineasm2 ; Masked & translucent 2-pixel wide vline
;;; ========== MACROS ==========
; REGd(r64): name of the low doubleword of an extended GPR, e.g. r10 -> r10d.
; Only meaningful for r8-r15.
%define REGd(r64) r64 %+ d
; rREG(w16): name of the full 64-bit GPR belonging to a word-sized register
; name, e.g. bx -> rbx.  Only meaningful for ax-bp.
%define rREG(w16) r %+ w16
; eREG(w16): name of the low doubleword of the GPR belonging to a word-sized
; register name, e.g. bx -> ebx.  Only meaningful for ax-bp.
%define eREG(w16) e %+ w16
;;; Multi-line macros for code shared between functions

;; Prologue getters.  Each one pushes the register it is about to overwrite
;; and then loads it from an imported variable; the function using it has to
;; pop the register again (in reverse order) before returning.
;; rBpl and rATransluc must be %define'd at the point of use.

; Save rBpl, then load the 32-bit a64_bpl into its low doubleword.
; The 32-bit write clears the upper half of rBpl.
%macro PGET_rBpl 0
        push    rBpl
        mov     REGd(rBpl), dword [rel a64_bpl]
%endmacro

; Save rATransluc, then load the a64_gtrans pointer into it.
%macro PGET_rATransluc 0
        push    rATransluc
        mov     rATransluc, qword [rel a64_gtrans]
%endmacro

; Load the right-shift count 'glogy' into cl (low byte of a64_glogy).
; Anything passed in rcx/ecx has to be moved elsewhere first.
%macro GET_cl 0
        mov     cl, byte [rel a64_glogy]
%endmacro
; Texel fetch:
;   <dest_reg_lobyte> := bufplc[vplc >> glogy]
; Arguments: <dest_reg_word> <bufplc_reg> <vplc_reg>
;   %1  word-sized name (ax-bp) of the destination register; its e../r..
;       forms are built with eREG/rREG.  Must not be cx, since cl is the
;       shift count.
;   %2  64-bit register holding the texture column base address.
;   %3  32-bit register holding vplc; left unchanged.
; Expects the shift count in cl (see GET_cl).
; On exit the whole 64-bit destination holds the texel zero-extended
; (all higher bytes are zero).  Flags are modified by the shift.
%macro gettexel 3
        mov     eREG(%1), %3
        shr     eREG(%1), cl
        movzx   eREG(%1), byte [%2 + rREG(%1)]
%endmacro
; Palette/shade lookup followed by translucency lookup:
;   <tmp_reg_lobyte> := rATransluc[(rDst[<dst_ofs>] << 8) | rAPalookup[<texel_reg>]]
; Arguments: <tmp_reg> <tmp_reg_lobyte> <texel_reg> <dst_ofs>
;   %1  64-bit scratch register, clobbered.
;   %2  low-byte name of %1; receives the resulting pixel.
;   %3  64-bit register holding the zero-extended texel; may not be %1.
;   %4  constant byte offset from rDst of the frame buffer pixel to blend with.
; rDst, rAPalookup and rATransluc must be %define'd at the point of use.
; The instruction order matters: the low byte of %1 is only free to take the
; palookup result after the frame buffer byte has been shifted up.
; Flags are modified by the shift.
%macro dotranspal 4
        ; Get palette index of the pixel in the frame buffer into bits 8-15.
        ; NOTE: e.g. "mov ah, byte [reg]" is not encodeable.
        movzx   %1, byte [rDst + %4]
        shl     %1, 8
        mov     %2, byte [rAPalookup + %3]      ; palette/shade -> bits 0-7
        mov     %2, byte [rATransluc + %1]      ; translucency
%endmacro
;;; ========== TVLINEASM1 ==========
;;; TODO: transmode, nonpow2

;;; Loop registers that do not arrive as arguments
%define rBpl       r10          ; bytes per frame buffer line (a64_bpl)
%define rATransluc r12          ; translucency table (a64_gtrans)
%define rVplc      r11d         ; running texture coordinate

;;; Argument registers that stay live throughout the loop
%define rVinc      edi
%define rAPalookup rsi
%define rCnt       edx
%define rABufplc   r8
%define rDst       r9

;;; int32_t tvlineasm1(vinc, *paloffs, cnt, vplc, *bufplc, *p)
;;;   eax              edi   rsi       edx  ecx   r8       r9
;;;
;;; Draws one masked, translucent vertical line of cnt+1 pixels starting at p,
;;; stepping rBpl bytes per pixel.  Texels equal to 255 are skipped.
;;; Returns the final vplc in eax.
;;; Leaf function: rbx, r10 and r12 are saved on the stack and restored;
;;; rax, rcx, rdx, r9, r11 and the flags are clobbered.
ALIGN 16
tvlineasm1:
        ;; vplc arrives in ecx, but cl is needed for the shift count.
        mov     rVplc, ecx
        GET_cl
        ;; Save the registers the loop overwrites and load their loop values.
        PGET_rBpl
        PGET_rATransluc
        push    rbx
        inc     rCnt                    ; the loop body runs cnt+1 times
        jmp     short .loop             ; hop over the alignment padding

ALIGN 16
.loop:
        gettexel bx, rABufplc, rVplc
        cmp     bl, 255                 ; 255 marks a masked-out texel
        je      short .masked
        dotranspal rax, al, rbx, 0
        mov     byte [rDst], al
.masked:
        add     rDst, rBpl              ; next frame buffer line
        add     rVplc, rVinc
        dec     rCnt
        jnz     short .loop

        ;; Restore in reverse push order, then hand back vplc.
        pop     rbx
        pop     rATransluc
        pop     rBpl
        mov     eax, rVplc              ; return vplc
        ret

;;; Undefine input arg macros
%undef rVinc
%undef rAPalookup
%undef rCnt
%undef rABufplc
%undef rDst
;;; Undefine rVplc register macro, but keep rBpl and rATransluc (r10 and r12).
;;; Take care not to clash with them in the following.
%undef rVplc
;;; ========== TVLINEASM2 ==========
;;; TODO: transmode, nonpow2, second paloffs

;;; Registers used in the loop.  rBpl (r10) and rATransluc (r12) are still
;;; defined from the tvlineasm1 section and are used here as well.
%define rVinc2     r11d
%define rABufplc2  r13
%define rDstEnd    r14
%define rAPalookup r15 ; TODO: second paloffs!

;;; Registers of input args also used in the loop
%define rVplc2     edi
%define rVinc1     esi
%define rABufplc1  rdx
; bufplc2: rcx -> r13
%define rVplc1     r8d
%define rDst       r9

;;; Pass: asm1=vinc2, asm2=pend
;;; Return: asm1=vplc1, asm2=vplc2
;;; void tvlineasm2(vplc2, vinc1, *bufplc1, *bufplc2, vplc1, *p)
;;;                 edi    esi    rdx       rcx       r8d    r9
;;;
;;; Draws two adjacent masked, translucent vertical lines: the left pixel
;;; (p[0]) is textured from bufplc1/vplc1/vinc1, the right one (p[1]) from
;;; bufplc2/vplc2/vinc2.  Each texel is tested against 255 on its own, so a
;;; masked texel next to a visible one leaves its frame buffer byte untouched.
;;; Both pixels currently go through the same palookup table (a64_paloffs).
;;; Leaf function: rbx, r10, r12-r15 are saved on the stack and restored;
;;; rax, rcx, rdi, r8, r9, r11 and the flags are clobbered.
ALIGN 16
tvlineasm2:
        ;; First, back up callee-saved registers and set up those used in the loop.
        mov     rVinc2, dword [rel asm1]
        PGET_rBpl
        push    rABufplc2
        mov     rABufplc2, rcx          ; free rcx: cl becomes the shift count
        GET_cl
        PGET_rATransluc
        push    rDstEnd
        mov     rDstEnd, [rel asm2]
        dec     rDstEnd
        add     rDstEnd, rBpl           ; one more: compare with a-c.c's tvlineasm2()
        push    rbx
        push    rAPalookup
        mov     rAPalookup, [rel a64_paloffs]
        jmp     short .loop             ; hop over the alignment padding

ALIGN 16
.loop:
        ;; Left pixel
        gettexel bx, rABufplc1, rVplc1
        cmp     bl, 255                 ; 255 marks a masked-out texel
        je      short .skip_left
        dotranspal rax, al, rbx, 0
        mov     byte [rDst], al
.skip_left:
        ;; Right pixel
        gettexel bx, rABufplc2, rVplc2
        cmp     bl, 255
        je      short .skip_right
        dotranspal rax, al, rbx, 1
        mov     byte [rDst + 1], al
.skip_right:
        add     rVplc1, rVinc1
        add     rVplc2, rVinc2
        add     rDst, rBpl
        ;; NOTE(review): exact-equality exit; assumes (pend-1) - p is a
        ;; non-negative multiple of the line pitch -- confirm against callers.
        cmp     rDst, rDstEnd
        jnz     .loop

        ;; Hand the final texture coordinates back through asm1/asm2.
        ;; The 32-bit moves zero-extend into rax before the intptr_t store.
        mov     eax, rVplc1
        mov     [rel asm1], rax
        mov     eax, rVplc2
        mov     [rel asm2], rax

        ;; Restore in reverse push order.
        pop     rAPalookup
        pop     rbx
        pop     rDstEnd
        pop     rATransluc
        pop     rABufplc2
        pop     rBpl
        ret