Add WIP x86_64 assembly texture mapping routines.

- Currently: only tvlineasm1 and tvlineasm2, but incomplete (no reverse
  translucency, nonpow2 textures will crash)
- For System V AMD64 calling conventions; requires YASM

git-svn-id: https://svn.eduke32.com/eduke32@4066 1a8010ca-5511-0410-912e-c29ae57300e0
This commit is contained in:
helixhorned 2013-09-21 13:37:31 +00:00
parent d5b6496ddb
commit eac9dd04bf
5 changed files with 272 additions and 7 deletions

View file

@ -146,6 +146,9 @@ USE_LIBVPX ?= 1
NETCODE ?= 1
LUNATIC ?= 0
# EXPERIMENTAL, unfinished x86_64 assembly routines. DO NOT ENABLE.
USE_ASM64 ?= 0
ifeq (0,$(USE_OPENGL))
POLYMER = 0
USE_LIBVPX = 0

View file

@ -52,6 +52,9 @@ ifeq (0,$(NOASM))
ENGINEOBJS+= $(OBJ)/a.$o
else
ENGINEOBJS+= $(OBJ)/a-c.$o
ifneq (0,$(USE_ASM64))
ENGINEOBJS+= $(OBJ)/a64.$o
endif
endif
@ -288,6 +291,11 @@ $(OBJ)/%.$o: $(SRC)/%.nasm
$(COMPILE_STATUS)
if $(AS) $(OURASFLAGS) $< -o $@; then $(COMPILE_OK); else $(COMPILE_FAILED); fi
# TODO: Makefile vars...
$(OBJ)/%.$o: $(SRC)/%.yasm
$(COMPILE_STATUS)
if yasm -f elf64 $< -o $@; then $(COMPILE_OK); else $(COMPILE_FAILED); fi
# Comment out the following rule to debug a-c.o
$(OBJ)/a-c.$o: $(SRC)/a-c.c
$(COMPILE_STATUS)

View file

@ -244,6 +244,9 @@ endif
ifneq (0,$(NOASM))
BUILDCOMMONFLAGS+= -DNOASM
endif
ifneq (0,$(USE_ASM64))
BUILDCOMMONFLAGS+= -DUSE_ASM64
endif
ifneq (0,$(LINKED_GTK))
BUILDCOMMONFLAGS+= -DLINKED_GTK
endif

View file

@ -19,16 +19,33 @@ extern intptr_t asm1, asm2, asm3, asm4;
extern int32_t fpuasm, globalx3, globaly3;
extern void *reciptable;
#ifdef USE_ASM64
# define A64_ASSIGN(var, val) var=val
#else
# define A64_ASSIGN(var, val)
#endif
#ifdef USE_ASM64
// variables for a64.yasm
int32_t a64_bpl, a64_transmode, a64_glogy;
intptr_t a64_paloffs;
char *a64_gtrans;
#endif
static int32_t bpl, transmode = 0;
static int32_t glogx, glogy, gbxinc, gbyinc, gpinc;
static char *gbuf, *gpal, *ghlinepal, *gtrans;
static char *gpal2;
//Global variable functions
void setvlinebpl(int32_t dabpl) { bpl = dabpl; }
void fixtransluscence(intptr_t datransoff) { gtrans = (char *)datransoff; }
void settransnormal(void) { transmode = 0; }
void settransreverse(void) { transmode = 1; }
void setvlinebpl(int32_t dabpl) { A64_ASSIGN(a64_bpl, dabpl); bpl = dabpl;}
void fixtransluscence(intptr_t datransoff)
{
A64_ASSIGN(a64_gtrans, (char *)datransoff);
gtrans = (char *)datransoff;
}
void settransnormal(void) { A64_ASSIGN(a64_transmode, 0); transmode = 0; }
void settransreverse(void) { A64_ASSIGN(a64_transmode, 1); transmode = 1; }
///// Ceiling/floor horizontal line functions /////
@ -230,8 +247,18 @@ void mvlineasm4(int32_t cnt, char *p)
Bmemcpy(vplce, vplc, sizeof(vplce));
}
#ifdef USE_ASM64
# define GLOGY a64_glogy
#else
# define GLOGY glogy
#endif
void setuptvlineasm(int32_t neglogy) { glogy = neglogy; }
void setuptvlineasm(int32_t neglogy)
{
GLOGY = neglogy;
}
#if !defined USE_ASM64
// cnt+1 loop iterations!
int32_t tvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p)
{
@ -270,13 +297,17 @@ int32_t tvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, i
return vplc;
}
#endif
void setuptvlineasm2(int32_t neglogy, intptr_t paloffs1, intptr_t paloffs2)
{
glogy = neglogy;
GLOGY = neglogy;
A64_ASSIGN(a64_paloffs, paloffs1);
gpal = (char *)paloffs1;
gpal2 = (char *)paloffs2;
}
#if !defined USE_ASM64
// Pass: asm1=vinc2, asm2=pend
// Return: asm1=vplc1, asm2=vplc2
void tvlineasm2(uint32_t vplc2, int32_t vinc1, intptr_t bufplc1, intptr_t bufplc2, uint32_t vplc1, intptr_t p)
@ -330,7 +361,7 @@ void tvlineasm2(uint32_t vplc2, int32_t vinc1, intptr_t bufplc1, intptr_t bufplc
asm1 = vplc1;
asm2 = vplc2;
}
#endif
//Floor sprite horizontal line functions
void msethlineshift(int32_t logx, int32_t logy) { glogx = logx; glogy = logy; }

View file

@ -0,0 +1,220 @@
;;; x86_64 assembly for basic texture mapping functions, based on a.nasm.
;;; For use with System V AMD64 calling convention (rdi rsi rdx rcx r8 r9)
;;; See the included license file "BUILDLIC.TXT" for license info.
SECTION .text
%ifdef UNDERSCORES
%define asm1 _asm1
%define asm2 _asm2
%define a64_bpl _a64_bpl
%define a64_transmode _a64_transmode
%define a64_glogy _a64_glogy
%define a64_gtrans _a64_gtrans
%define a64_paloffs _a64_paloffs
%define tvlineasm1 _tvlineasm1
%define tvlineasm2 _tvlineasm2
%endif
;;; Imports
EXTERN asm1 ; intptr_t
EXTERN asm2 ; intptr_t
EXTERN a64_bpl ; int32_t
EXTERN a64_transmode ; int32_t
EXTERN a64_glogy ; int32_t
EXTERN a64_gtrans ; char *
EXTERN a64_paloffs ; intptr_t
;;; Exports
GLOBAL tvlineasm1 ; Masked & transluscent 1-pixel wide vline
GLOBAL tvlineasm2 ; Masked & transluscent 2-pixel wide vline
;;; ========== MACROS ==========
; Construct name referring to low doubleword of GPR, only for r8-r15.
%define REGd(reg) reg %+ d
; Construct name of whole GPR from word-sized register, only for ax-bp.
%define rREG(regw) r %+ regw
; Construct name of low doubleword of GPR from word-sized register, only for ax-bp.
%define eREG(regw) e %+ regw
;;; Multi-line macros for code shared between functions
;; Prologue getters
%macro PGET_rBpl 0
push rBpl
mov REGd(rBpl), dword [rel a64_bpl]
%endmacro
%macro PGET_rATransluc 0
push rATransluc
mov rATransluc, [rel a64_gtrans]
%endmacro
; Get right shift count 'glogy' into cl
%macro GET_cl 0
mov cl, byte [rel a64_glogy]
%endmacro
; Look up texel
; <dest_reg_lobyte> := bufplc[vplc>>glogy]
; High bytes are cleared to zero.
%macro gettexel 3 ; <dest_reg_word> <bufplc_reg> <vlpc_reg>
mov eREG(%1), %3
shr eREG(%1), cl
movzx eREG(%1), byte [%2 + rREG(%1)]
%endmacro
; Do palette/shade and translucency lookup
%macro dotranspal 4 ; <tmp_reg> <tmp_reg_lobyte> <texel_reg> <dst_ofs>
; Get palette index of the pixel in the frame buffer.
; NOTE: e.g. "mov ah, byte [reg]" is not encodeable.
movzx %1, byte [rDst + %4]
shl %1, 8
mov %2, byte [rAPalookup + %3] ; palette/shade
mov %2, byte [rATransluc + %1] ; translucency
%endmacro
;;; ========== TVLINEASM1 ==========
;;; TODO: transmode, nonpow2
;;; Registers used in the loop
%define rBpl r10
%define rATransluc r12
%define rVplc r11d
;;; Registers of input args also used in the loop
%define rVinc edi
%define rAPalookup rsi
%define rCnt edx
%define rABufplc r8
%define rDst r9
;;; int32_t tvlineasm1(vinc, *paloffs, cnt, vplc, *bufplc, *p)
;;; eax edi rsi edx ecx r8 r9
ALIGN 16
tvlineasm1:
inc rCnt
;; First, back up callee-saved registers and set up those used in the loop.
PGET_rBpl
mov rVplc, ecx
GET_cl
PGET_rATransluc
push rbx
jmp short tv1_loop
ALIGN 16
tv1_loop:
gettexel bx, rABufplc, rVplc
cmp bl, 255
je short tv1_skiptrans
dotranspal rax, al, rbx, 0
mov byte [rDst], al
tv1_skiptrans:
add rVplc, rVinc
add rDst, rBpl
dec rCnt
jnz short tv1_loop
mov eax, rVplc ; return vplc
pop rbx
pop rATransluc
pop rBpl
ret
;;; Undefine input arg macros
%undef rVinc
%undef rAPalookup
%undef rCnt
%undef rABufplc
%undef rDst
;;; Udefine rVplc register macro, but keep rBpl and rATransluc (r10 and r12).
;;; Take care not to clash with them in the following.
%undef rVplc
;;; ========== TVLINEASM2 ==========
;;; Registers used in the loop
%define rVinc2 r11d
%define rABufplc2 r13
%define rDstEnd r14
%define rAPalookup r15 ; TODO: second paloffs!
;;; Registers of input args also used in the loop
%define rVplc2 edi
%define rVinc1 esi
%define rABufplc1 rdx
; bufplc2: rcx -> r13
%define rVplc1 r8d
%define rDst r9
;;; Pass: asm1=vinc2, asm2=pend
;;; Return: asm1=vplc1, asm2=vplc2
;;; void tvlineasm2(vplc2, vinc1, *bufplc1, *bufplc2, vplc1, *p)
;;; edi esi rdx rcx r8d r9
ALIGN 16
tvlineasm2:
;; First, back up callee-saved registers and set up those used in the loop.
mov rVinc2, dword [rel asm1]
PGET_rBpl
push rABufplc2
mov rABufplc2, rcx
GET_cl
PGET_rATransluc
push rDstEnd
mov rDstEnd, [rel asm2]
dec rDstEnd
add rDstEnd, rBpl ; one more: compare with a-c.c's tvlineasm2()
push rbx
push rbp
push rAPalookup
mov rAPalookup, [rel a64_paloffs]
jmp short tv2_loop
ALIGN 16
tv2_loop:
gettexel bx, rABufplc1, rVplc1
gettexel bp, rABufplc2, rVplc2
mov eax, ebp
shl eax, 8
or eax, ebx
cmp eax, 0xffff
jz tv2_skiptrans
dotranspal rax, al, rbx, 0
dotranspal rbx, bl, rbp, 1
and rax, 0xff
shl rbx, 8
or rax, rbx
mov word [rDst], ax
tv2_skiptrans:
add rVplc1, rVinc1
add rVplc2, rVinc2
add rDst, rBpl
cmp rDst, rDstEnd
jnz tv2_loop
pop r15
pop rbp
pop rbx
pop rDstEnd
pop rATransluc
pop rABufplc2
pop rBpl
ret