- deleted rt_copy*col and rt_map*col assembly versions after running benchmarks that show inferior performance to the C++ versions on both older and newer CPUs.

This commit is contained in:
Christoph Oelckers 2016-12-05 00:46:58 +01:00
parent 86fcc3fd21
commit f6fb27b683
4 changed files with 11 additions and 454 deletions

View file

@ -597,427 +597,6 @@ dmsdone add esp,8
;*----------------------------------------------------------------------
;*
;* rt_copy1col_asm
;*
;* ecx = hx
;* edx = sx
;* [esp+4] = yl
;* [esp+8] = yh
;*
;*----------------------------------------------------------------------
GLOBAL @rt_copy1col_asm@16
GLOBAL _rt_copy1col_asm
GLOBAL rt_copy1col_asm
align 16
rt_copy1col_asm:
_rt_copy1col_asm:
pop eax
mov edx,[esp+4*3]
mov ecx,[esp+4*2]
push edx
push ecx
mov ecx,[esp+4*2]
mov edx,[esp+4*3]
push eax
@rt_copy1col_asm@16:
mov eax, [esp+4]
push ebx
mov ebx, [esp+12]
push esi
sub ebx, eax
push edi
js .done
lea esi,[eax*4]
inc ebx ; ebx = count
mov eax,edx
add ecx,esi
mov edi,[ylookup+esi]
add ecx,[dc_temp] ; ecx = source
mov esi,[dc_pitch] ; esi = pitch
add eax,edi ; eax = dest
add eax,[dc_destorg]
shr ebx,1
jnc .even
mov dl,[ecx]
add ecx,4
mov [eax],dl
add eax,esi
.even and ebx,ebx
jz .done
.loop mov dl,[ecx]
mov dh,[ecx+4]
mov [eax],dl
mov [eax+esi],dh
add ecx,8
lea eax,[eax+esi*2]
dec ebx
jnz .loop
.done pop edi
pop esi
pop ebx
ret 8
;*----------------------------------------------------------------------
;*
;* rt_copy4cols_asm
;*
;* ecx = sx
;* edx = yl
;* [esp+4] = yh
;*
;*----------------------------------------------------------------------
GLOBAL @rt_copy4cols_asm@12
GLOBAL _rt_copy4cols_asm
GLOBAL rt_copy4cols_asm
align 16
rt_copy4cols_asm:
_rt_copy4cols_asm:
pop eax
mov ecx,[esp+8]
mov edx,[esp+4]
push ecx
mov ecx,[esp+4]
push eax
@rt_copy4cols_asm@12:
push ebx
mov ebx,[esp+8]
push esi
sub ebx,edx
push edi
js .done
inc ebx ; ebx = count
mov eax,ecx
mov esi,[ylookup+edx*4]
mov ecx,[dc_temp]
add eax,esi ; eax = dest
add eax,[dc_destorg]
lea ecx,[ecx+edx*4] ; ecx = source
mov edx,[dc_pitch] ; edx = pitch
shr ebx,1
jnc .even
mov esi,[ecx]
add ecx,4
mov [eax],esi
add eax,edx
.even and ebx,ebx
jz .done
.loop mov esi,[ecx]
mov edi,[ecx+4]
mov [eax],esi
mov [eax+edx],edi
add ecx,8
lea eax,[eax+edx*2]
dec ebx
jnz .loop
.done pop edi
pop esi
pop ebx
ret 4
;*----------------------------------------------------------------------
;*
;* rt_map1col_asm
;*
;* ecx = hx
;* edx = sx
;* [esp+4] = yl
;* [esp+8] = yh
;*
;*----------------------------------------------------------------------
GLOBAL @rt_map1col_asm@16
GLOBAL _rt_map1col_asm
GLOBAL rt_map1col_asm
align 16
rt_map1col_asm:
_rt_map1col_asm:
pop eax
mov edx,[esp+4*3]
mov ecx,[esp+4*2]
push edx
push ecx
mov ecx,[esp+4*2]
mov edx,[esp+4*3]
push eax
@rt_map1col_asm@16:
mov eax,[esp+4]
push ebx
mov ebx,[esp+12]
push ebp
push esi
sub ebx, eax
push edi
js .done
lea edi,[eax*4]
mov esi,[dc_colormap] ; esi = colormap
inc ebx ; ebx = count
mov eax,edx
lea ebp,[ecx+edi] ; ebp = source
add ebp,[dc_temp]
mov ecx,[ylookup+edi]
mov edi,[dc_pitch] ; edi = pitch
add eax,ecx ; eax = dest
xor ecx,ecx
xor edx,edx
add eax,[dc_destorg]
shr ebx,1
jnc .even
mov dl,[ebp]
add ebp,4
mov dl,[esi+edx]
mov [eax],dl
add eax,edi
.even and ebx,ebx
jz .done
.loop mov dl,[ebp]
mov cl,[ebp+4]
add ebp,8
mov dl,[esi+edx]
mov cl,[esi+ecx]
mov [eax],dl
mov [eax+edi],cl
dec ebx
lea eax,[eax+edi*2]
jnz .loop
.done pop edi
pop esi
pop ebp
pop ebx
ret 8
;*----------------------------------------------------------------------
;*
;* rt_map4cols_asm
;*
;* rt_map4cols_asm1 is for PPro and above
;* rt_map4cols_asm2 is for Pentium and below
;*
;* ecx = sx
;* edx = yl
;* [esp+4] = yh
;*
;*----------------------------------------------------------------------
GLOBAL @rt_map4cols_asm1@12
GLOBAL _rt_map4cols_asm1
GLOBAL rt_map4cols_asm1
align 16
rt_map4cols_asm1:
_rt_map4cols_asm1:
pop eax
mov ecx,[esp+8]
mov edx,[esp+4]
push ecx
mov ecx,[esp+4]
push eax
@rt_map4cols_asm1@12:
push ebx
mov ebx,[esp+8]
push ebp
push esi
sub ebx,edx
push edi
js near .done
mov esi,[dc_colormap] ; esi = colormap
shl edx,2
mov eax,ecx
inc ebx ; ebx = count
mov edi,[ylookup+edx]
mov ebp,[dc_temp]
add ebp,edx ; ebp = source
add eax,edi ; eax = dest
mov edi,[dc_pitch] ; edi = pitch
add eax,[dc_destorg]
xor ecx,ecx
xor edx,edx
shr ebx,1
jnc .even
mov dl,[ebp]
mov cl,[ebp+1]
add ebp,4
mov dl,[esi+edx]
mov cl,[esi+ecx]
mov [eax],dl
mov [eax+1],cl
mov dl,[ebp-2]
mov cl,[ebp-1]
mov dl,[esi+edx]
mov cl,[esi+ecx]
mov [eax+2],dl
mov [eax+3],cl
add eax,edi
.even and ebx,ebx
jz .done
.loop:
mov dl,[ebp]
mov cl,[ebp+1]
add ebp,8
mov dl,[esi+edx]
mov cl,[esi+ecx]
mov [eax],dl
mov [eax+1],cl
mov dl,[ebp-6]
mov cl,[ebp-5]
mov dl,[esi+edx]
mov cl,[esi+ecx]
mov [eax+2],dl
mov [eax+3],cl
mov dl,[ebp-4]
mov cl,[ebp-3]
mov dl,[esi+edx]
mov cl,[esi+ecx]
mov [eax+edi],dl
mov [eax+edi+1],cl
mov dl,[ebp-2]
mov cl,[ebp-1]
mov dl,[esi+edx]
mov cl,[esi+ecx]
mov [eax+edi+2],dl
mov [eax+edi+3],cl
lea eax,[eax+edi*2]
dec ebx
jnz .loop
.done pop edi
pop esi
pop ebp
pop ebx
ret 4
GLOBAL @rt_map4cols_asm2@12
GLOBAL _rt_map4cols_asm2
GLOBAL rt_map4cols_asm2
align 16
rt_map4cols_asm2:
_rt_map4cols_asm2:
pop eax
mov ecx,[esp+8]
mov edx,[esp+4]
push ecx
mov ecx,[esp+4]
push eax
@rt_map4cols_asm2@12:
push ebx
mov ebx,[esp+8]
push ebp
push esi
sub ebx,edx
push edi
js near .done
mov esi,[dc_colormap] ; esi = colormap
shl edx,2
mov eax,ecx
inc ebx ; ebx = count
mov edi,[ylookup+edx]
mov ebp,[dc_temp]
add ebp,edx ; ebp = source
add eax,edi ; eax = dest
mov edi,[dc_pitch] ; edi = pitch
add eax,[dc_destorg]
xor ecx,ecx
xor edx,edx
shr ebx,1
jnc .even
mov dl,[ebp]
mov cl,[ebp+1]
add ebp,4
mov dl,[esi+edx]
mov cl,[esi+ecx]
mov [eax],dl
mov [eax+1],cl
mov dl,[ebp-2]
mov cl,[ebp-1]
mov dl,[esi+edx]
mov cl,[esi+ecx]
mov [eax+2],dl
mov [eax+3],cl
add eax,edi
.even and ebx,ebx
jz .done
.loop:
mov dl,[ebp+3]
mov ch,[esi+edx]
mov dl,[ebp+2]
mov cl,[esi+edx]
shl ecx,16
mov dl,[ebp+1]
mov ch,[esi+edx]
mov dl,[ebp]
mov cl,[esi+edx]
mov [eax],ecx
add eax,edi
mov dl,[ebp+7]
mov ch,[esi+edx]
mov dl,[ebp+6]
mov cl,[esi+edx]
shl ecx,16
mov dl,[ebp+5]
mov ch,[esi+edx]
mov dl,[ebp+4]
mov cl,[esi+edx]
mov [eax],ecx
add eax,edi
add ebp,8
dec ebx
jnz .loop
.done pop edi
pop esi
pop ebp
pop ebx
ret 4
align 16
GLOBAL rt_shaded4cols_asm
GLOBAL _rt_shaded4cols_asm

View file

@ -73,7 +73,6 @@ void (*R_DrawTranslatedColumn)(void);
void (*R_DrawShadedColumn)(void);
void (*R_DrawSpan)(void);
void (*R_DrawSpanMasked)(void);
void (*rt_map4cols)(int,int,int);
//
// R_DrawColumn
@ -2589,21 +2588,13 @@ void R_InitColumnDrawers ()
R_DrawShadedColumn = R_DrawShadedColumnP_C;
R_DrawSpan = R_DrawSpanP_ASM;
R_DrawSpanMasked = R_DrawSpanMaskedP_ASM;
if (CPU.Family <= 5)
{
rt_map4cols = rt_map4cols_asm2;
}
else
{
rt_map4cols = rt_map4cols_asm1;
}
#else
R_DrawColumnHoriz = R_DrawColumnHorizP_C;
R_DrawTranslatedColumn = R_DrawTranslatedColumnP_C;
R_DrawShadedColumn = R_DrawShadedColumnP_C;
R_DrawSpan = R_DrawSpanP_C;
R_DrawSpanMasked = R_DrawSpanMaskedP_C;
rt_map4cols = rt_map4cols_c;
rt_map4cols = rt_map4cols;
#endif
}

View file

@ -120,16 +120,19 @@ extern void (*R_DrawColumnHoriz)(void);
void R_InitColumnDrawers ();
// [RH] Moves data from the temporary buffer to the screen.
void rt_copy1col(int hx, int sx, int yl, int yh);
void rt_copy4cols(int sx, int yl, int yh);
void rt_map4cols(int sx, int yl, int yh);
extern "C"
{
void rt_copy1col_c (int hx, int sx, int yl, int yh);
void rt_copy4cols_c (int sx, int yl, int yh);
void rt_shaded1col (int hx, int sx, int yl, int yh);
void rt_shaded4cols_c (int sx, int yl, int yh);
void rt_shaded4cols_asm (int sx, int yl, int yh);
void rt_map1col_c (int hx, int sx, int yl, int yh);
void rt_map1col (int hx, int sx, int yl, int yh);
void rt_add1col (int hx, int sx, int yl, int yh);
void rt_addclamp1col (int hx, int sx, int yl, int yh);
void rt_subclamp1col (int hx, int sx, int yl, int yh);
@ -141,7 +144,6 @@ void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh);
void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh);
void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh);
void rt_map4cols_c (int sx, int yl, int yh);
void rt_add4cols_c (int sx, int yl, int yh);
void rt_addclamp4cols_c (int sx, int yl, int yh);
void rt_subclamp4cols (int sx, int yl, int yh);
@ -153,29 +155,16 @@ void rt_tlateaddclamp4cols (int sx, int yl, int yh);
void rt_tlatesubclamp4cols (int sx, int yl, int yh);
void rt_tlaterevsubclamp4cols (int sx, int yl, int yh);
void rt_copy1col_asm (int hx, int sx, int yl, int yh);
void rt_map1col_asm (int hx, int sx, int yl, int yh);
void rt_copy4cols_asm (int sx, int yl, int yh);
void rt_map4cols_asm1 (int sx, int yl, int yh);
void rt_map4cols_asm2 (int sx, int yl, int yh);
void rt_add4cols_asm (int sx, int yl, int yh);
void rt_addclamp4cols_asm (int sx, int yl, int yh);
}
extern void (*rt_map4cols)(int sx, int yl, int yh);
#ifdef X86_ASM
#define rt_copy1col rt_copy1col_asm
#define rt_copy4cols rt_copy4cols_asm
#define rt_map1col rt_map1col_asm
#define rt_shaded4cols rt_shaded4cols_asm
#define rt_add4cols rt_add4cols_asm
#define rt_addclamp4cols rt_addclamp4cols_asm
#else
#define rt_copy1col rt_copy1col_c
#define rt_copy4cols rt_copy4cols_c
#define rt_map1col rt_map1col_c
#define rt_shaded4cols rt_shaded4cols_c
#define rt_add4cols rt_add4cols_c
#define rt_addclamp4cols rt_addclamp4cols_c

View file

@ -69,9 +69,8 @@ extern "C" void R_SetupAddCol();
extern "C" void R_SetupAddClampCol();
#endif
#ifndef X86_ASM
// Copies one span at hx to the screen at sx.
void rt_copy1col_c (int hx, int sx, int yl, int yh)
void rt_copy1col (int hx, int sx, int yl, int yh)
{
BYTE *source;
BYTE *dest;
@ -112,7 +111,7 @@ void rt_copy1col_c (int hx, int sx, int yl, int yh)
}
// Copies all four spans to the screen starting at sx.
void rt_copy4cols_c (int sx, int yl, int yh)
void rt_copy4cols (int sx, int yl, int yh)
{
int *source;
int *dest;
@ -145,7 +144,7 @@ void rt_copy4cols_c (int sx, int yl, int yh)
}
// Maps one span at hx to the screen at sx.
void rt_map1col_c (int hx, int sx, int yl, int yh)
void rt_map1col (int hx, int sx, int yl, int yh)
{
BYTE *colormap;
BYTE *source;
@ -180,7 +179,7 @@ void rt_map1col_c (int hx, int sx, int yl, int yh)
}
// Maps all four spans to the screen starting at sx.
void rt_map4cols_c (int sx, int yl, int yh)
void rt_map4cols (int sx, int yl, int yh)
{
BYTE *colormap;
BYTE *source;
@ -222,7 +221,6 @@ void rt_map4cols_c (int sx, int yl, int yh)
dest += pitch*2;
} while (--count);
}
#endif
void rt_Translate1col(const BYTE *translation, int hx, int yl, int yh)
{