From f6fb27b6835f586a381d86e6d212ba7626546a7d Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Mon, 5 Dec 2016 00:46:58 +0100 Subject: [PATCH] - deleted rt_copy*col and rt_map*col assembly versions after running benchmarks that show inferior performance to the C++ versions on both older and newer CPUs. --- src/asm_ia32/tmap.asm | 421 ------------------------------------------ src/r_draw.cpp | 11 +- src/r_draw.h | 23 +-- src/r_drawt.cpp | 10 +- 4 files changed, 11 insertions(+), 454 deletions(-) diff --git a/src/asm_ia32/tmap.asm b/src/asm_ia32/tmap.asm index cbfadd5d1..2096b9222 100644 --- a/src/asm_ia32/tmap.asm +++ b/src/asm_ia32/tmap.asm @@ -597,427 +597,6 @@ dmsdone add esp,8 -;*---------------------------------------------------------------------- -;* -;* rt_copy1col_asm -;* -;* ecx = hx -;* edx = sx -;* [esp+4] = yl -;* [esp+8] = yh -;* -;*---------------------------------------------------------------------- - -GLOBAL @rt_copy1col_asm@16 -GLOBAL _rt_copy1col_asm -GLOBAL rt_copy1col_asm - - align 16 - -rt_copy1col_asm: -_rt_copy1col_asm: - pop eax - mov edx,[esp+4*3] - mov ecx,[esp+4*2] - push edx - push ecx - mov ecx,[esp+4*2] - mov edx,[esp+4*3] - push eax - -@rt_copy1col_asm@16: - mov eax, [esp+4] - push ebx - mov ebx, [esp+12] - push esi - sub ebx, eax - push edi - js .done - - lea esi,[eax*4] - inc ebx ; ebx = count - mov eax,edx - add ecx,esi - mov edi,[ylookup+esi] - add ecx,[dc_temp] ; ecx = source - mov esi,[dc_pitch] ; esi = pitch - add eax,edi ; eax = dest - add eax,[dc_destorg] - - shr ebx,1 - jnc .even - - mov dl,[ecx] - add ecx,4 - mov [eax],dl - add eax,esi - -.even and ebx,ebx - jz .done - -.loop mov dl,[ecx] - mov dh,[ecx+4] - mov [eax],dl - mov [eax+esi],dh - add ecx,8 - lea eax,[eax+esi*2] - dec ebx - jnz .loop - -.done pop edi - pop esi - pop ebx - ret 8 - -;*---------------------------------------------------------------------- -;* -;* rt_copy4cols_asm -;* -;* ecx = sx -;* edx = yl -;* [esp+4] = yh -;* 
-;*---------------------------------------------------------------------- - -GLOBAL @rt_copy4cols_asm@12 -GLOBAL _rt_copy4cols_asm -GLOBAL rt_copy4cols_asm - - align 16 - -rt_copy4cols_asm: -_rt_copy4cols_asm: - pop eax - mov ecx,[esp+8] - mov edx,[esp+4] - push ecx - mov ecx,[esp+4] - push eax - -@rt_copy4cols_asm@12: - push ebx - mov ebx,[esp+8] - push esi - sub ebx,edx - push edi - js .done - - inc ebx ; ebx = count - mov eax,ecx - mov esi,[ylookup+edx*4] - mov ecx,[dc_temp] - add eax,esi ; eax = dest - add eax,[dc_destorg] - lea ecx,[ecx+edx*4] ; ecx = source - mov edx,[dc_pitch] ; edx = pitch - - shr ebx,1 - jnc .even - - mov esi,[ecx] - add ecx,4 - mov [eax],esi - add eax,edx - -.even and ebx,ebx - jz .done - -.loop mov esi,[ecx] - mov edi,[ecx+4] - mov [eax],esi - mov [eax+edx],edi - add ecx,8 - lea eax,[eax+edx*2] - dec ebx - jnz .loop - -.done pop edi - pop esi - pop ebx - ret 4 - -;*---------------------------------------------------------------------- -;* -;* rt_map1col_asm -;* -;* ecx = hx -;* edx = sx -;* [esp+4] = yl -;* [esp+8] = yh -;* -;*---------------------------------------------------------------------- - -GLOBAL @rt_map1col_asm@16 -GLOBAL _rt_map1col_asm -GLOBAL rt_map1col_asm - - align 16 - -rt_map1col_asm: -_rt_map1col_asm: - pop eax - mov edx,[esp+4*3] - mov ecx,[esp+4*2] - push edx - push ecx - mov ecx,[esp+4*2] - mov edx,[esp+4*3] - push eax - -@rt_map1col_asm@16: - mov eax,[esp+4] - push ebx - mov ebx,[esp+12] - push ebp - push esi - sub ebx, eax - push edi - js .done - - lea edi,[eax*4] - mov esi,[dc_colormap] ; esi = colormap - inc ebx ; ebx = count - mov eax,edx - lea ebp,[ecx+edi] ; ebp = source - add ebp,[dc_temp] - mov ecx,[ylookup+edi] - mov edi,[dc_pitch] ; edi = pitch - add eax,ecx ; eax = dest - xor ecx,ecx - xor edx,edx - add eax,[dc_destorg] - - shr ebx,1 - jnc .even - - mov dl,[ebp] - add ebp,4 - mov dl,[esi+edx] - mov [eax],dl - add eax,edi - -.even and ebx,ebx - jz .done - -.loop mov dl,[ebp] - mov cl,[ebp+4] - add ebp,8 - 
mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax],dl - mov [eax+edi],cl - dec ebx - lea eax,[eax+edi*2] - jnz .loop - -.done pop edi - pop esi - pop ebp - pop ebx - ret 8 - -;*---------------------------------------------------------------------- -;* -;* rt_map4cols_asm -;* -;* rt_map4cols_asm1 is for PPro and above -;* rt_map4cols_asm2 is for Pentium and below -;* -;* ecx = sx -;* edx = yl -;* [esp+4] = yh -;* -;*---------------------------------------------------------------------- - -GLOBAL @rt_map4cols_asm1@12 -GLOBAL _rt_map4cols_asm1 -GLOBAL rt_map4cols_asm1 - - align 16 - -rt_map4cols_asm1: -_rt_map4cols_asm1: - pop eax - mov ecx,[esp+8] - mov edx,[esp+4] - push ecx - mov ecx,[esp+4] - push eax - -@rt_map4cols_asm1@12: - push ebx - mov ebx,[esp+8] - push ebp - push esi - sub ebx,edx - push edi - js near .done - - mov esi,[dc_colormap] ; esi = colormap - shl edx,2 - mov eax,ecx - inc ebx ; ebx = count - mov edi,[ylookup+edx] - mov ebp,[dc_temp] - add ebp,edx ; ebp = source - add eax,edi ; eax = dest - mov edi,[dc_pitch] ; edi = pitch - add eax,[dc_destorg] - xor ecx,ecx - xor edx,edx - - shr ebx,1 - jnc .even - - mov dl,[ebp] - mov cl,[ebp+1] - add ebp,4 - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax],dl - mov [eax+1],cl - mov dl,[ebp-2] - mov cl,[ebp-1] - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax+2],dl - mov [eax+3],cl - add eax,edi - -.even and ebx,ebx - jz .done - -.loop: - mov dl,[ebp] - mov cl,[ebp+1] - add ebp,8 - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax],dl - mov [eax+1],cl - mov dl,[ebp-6] - mov cl,[ebp-5] - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax+2],dl - mov [eax+3],cl - mov dl,[ebp-4] - mov cl,[ebp-3] - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax+edi],dl - mov [eax+edi+1],cl - mov dl,[ebp-2] - mov cl,[ebp-1] - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax+edi+2],dl - mov [eax+edi+3],cl - lea eax,[eax+edi*2] - dec ebx - - jnz .loop - -.done pop edi - pop esi - pop ebp - pop ebx - ret 4 - -GLOBAL @rt_map4cols_asm2@12 -GLOBAL 
_rt_map4cols_asm2 -GLOBAL rt_map4cols_asm2 - - align 16 - -rt_map4cols_asm2: -_rt_map4cols_asm2: - pop eax - mov ecx,[esp+8] - mov edx,[esp+4] - push ecx - mov ecx,[esp+4] - push eax - -@rt_map4cols_asm2@12: - push ebx - mov ebx,[esp+8] - push ebp - push esi - sub ebx,edx - push edi - js near .done - - mov esi,[dc_colormap] ; esi = colormap - shl edx,2 - mov eax,ecx - inc ebx ; ebx = count - mov edi,[ylookup+edx] - mov ebp,[dc_temp] - add ebp,edx ; ebp = source - add eax,edi ; eax = dest - mov edi,[dc_pitch] ; edi = pitch - add eax,[dc_destorg] - xor ecx,ecx - xor edx,edx - - shr ebx,1 - jnc .even - - mov dl,[ebp] - mov cl,[ebp+1] - add ebp,4 - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax],dl - mov [eax+1],cl - mov dl,[ebp-2] - mov cl,[ebp-1] - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax+2],dl - mov [eax+3],cl - add eax,edi - -.even and ebx,ebx - jz .done - -.loop: - mov dl,[ebp+3] - mov ch,[esi+edx] - mov dl,[ebp+2] - mov cl,[esi+edx] - shl ecx,16 - mov dl,[ebp+1] - mov ch,[esi+edx] - mov dl,[ebp] - mov cl,[esi+edx] - mov [eax],ecx - add eax,edi - - mov dl,[ebp+7] - mov ch,[esi+edx] - mov dl,[ebp+6] - mov cl,[esi+edx] - shl ecx,16 - mov dl,[ebp+5] - mov ch,[esi+edx] - mov dl,[ebp+4] - mov cl,[esi+edx] - mov [eax],ecx - add eax,edi - add ebp,8 - dec ebx - - jnz .loop - -.done pop edi - pop esi - pop ebp - pop ebx - ret 4 - - align 16 GLOBAL rt_shaded4cols_asm GLOBAL _rt_shaded4cols_asm diff --git a/src/r_draw.cpp b/src/r_draw.cpp index 0e217c2d7..7e966c8ab 100644 --- a/src/r_draw.cpp +++ b/src/r_draw.cpp @@ -73,7 +73,6 @@ void (*R_DrawTranslatedColumn)(void); void (*R_DrawShadedColumn)(void); void (*R_DrawSpan)(void); void (*R_DrawSpanMasked)(void); -void (*rt_map4cols)(int,int,int); // // R_DrawColumn @@ -2589,21 +2588,13 @@ void R_InitColumnDrawers () R_DrawShadedColumn = R_DrawShadedColumnP_C; R_DrawSpan = R_DrawSpanP_ASM; R_DrawSpanMasked = R_DrawSpanMaskedP_ASM; - if (CPU.Family <= 5) - { - rt_map4cols = rt_map4cols_asm2; - } - else - { - rt_map4cols = 
rt_map4cols_asm1; - } #else R_DrawColumnHoriz = R_DrawColumnHorizP_C; R_DrawTranslatedColumn = R_DrawTranslatedColumnP_C; R_DrawShadedColumn = R_DrawShadedColumnP_C; R_DrawSpan = R_DrawSpanP_C; R_DrawSpanMasked = R_DrawSpanMaskedP_C; - rt_map4cols = rt_map4cols_c; + /* rt_map4cols is now an ordinary function declared in r_draw.h; there is no function pointer left to assign here. */ #endif } diff --git a/src/r_draw.h b/src/r_draw.h index 8b1c8ffbc..6713d4091 100644 --- a/src/r_draw.h +++ b/src/r_draw.h @@ -120,16 +120,19 @@ extern void (*R_DrawColumnHoriz)(void); void R_InitColumnDrawers (); // [RH] Moves data from the temporary buffer to the screen. + +void rt_copy1col(int hx, int sx, int yl, int yh); +void rt_copy4cols(int sx, int yl, int yh); +void rt_map4cols(int sx, int yl, int yh); + extern "C" { -void rt_copy1col_c (int hx, int sx, int yl, int yh); -void rt_copy4cols_c (int sx, int yl, int yh); void rt_shaded1col (int hx, int sx, int yl, int yh); void rt_shaded4cols_c (int sx, int yl, int yh); void rt_shaded4cols_asm (int sx, int yl, int yh); -void rt_map1col_c (int hx, int sx, int yl, int yh); +void rt_map1col (int hx, int sx, int yl, int yh); void rt_add1col (int hx, int sx, int yl, int yh); void rt_addclamp1col (int hx, int sx, int yl, int yh); void rt_subclamp1col (int hx, int sx, int yl, int yh); @@ -141,7 +144,6 @@ void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh); void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh); void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh); -void rt_map4cols_c (int sx, int yl, int yh); void rt_add4cols_c (int sx, int yl, int yh); void rt_addclamp4cols_c (int sx, int yl, int yh); void rt_subclamp4cols (int sx, int yl, int yh); @@ -153,29 +155,16 @@ void rt_tlateaddclamp4cols (int sx, int yl, int yh); void rt_tlatesubclamp4cols (int sx, int yl, int yh); void rt_tlaterevsubclamp4cols (int sx, int yl, int yh); -void rt_copy1col_asm (int hx, int sx, int yl, int yh); -void rt_map1col_asm (int hx, int sx, int yl, int yh); - -void rt_copy4cols_asm (int sx, int yl, int yh); -void rt_map4cols_asm1 
(int sx, int yl, int yh); -void rt_map4cols_asm2 (int sx, int yl, int yh); void rt_add4cols_asm (int sx, int yl, int yh); void rt_addclamp4cols_asm (int sx, int yl, int yh); } -extern void (*rt_map4cols)(int sx, int yl, int yh); #ifdef X86_ASM -#define rt_copy1col rt_copy1col_asm -#define rt_copy4cols rt_copy4cols_asm -#define rt_map1col rt_map1col_asm #define rt_shaded4cols rt_shaded4cols_asm #define rt_add4cols rt_add4cols_asm #define rt_addclamp4cols rt_addclamp4cols_asm #else -#define rt_copy1col rt_copy1col_c -#define rt_copy4cols rt_copy4cols_c -#define rt_map1col rt_map1col_c #define rt_shaded4cols rt_shaded4cols_c #define rt_add4cols rt_add4cols_c #define rt_addclamp4cols rt_addclamp4cols_c diff --git a/src/r_drawt.cpp b/src/r_drawt.cpp index cb228cce0..a4f581d12 100644 --- a/src/r_drawt.cpp +++ b/src/r_drawt.cpp @@ -69,9 +69,8 @@ extern "C" void R_SetupAddCol(); extern "C" void R_SetupAddClampCol(); #endif -#ifndef X86_ASM // Copies one span at hx to the screen at sx. -void rt_copy1col_c (int hx, int sx, int yl, int yh) +void rt_copy1col (int hx, int sx, int yl, int yh) { BYTE *source; BYTE *dest; @@ -112,7 +111,7 @@ void rt_copy1col_c (int hx, int sx, int yl, int yh) } // Copies all four spans to the screen starting at sx. -void rt_copy4cols_c (int sx, int yl, int yh) +void rt_copy4cols (int sx, int yl, int yh) { int *source; int *dest; @@ -145,7 +144,7 @@ void rt_copy4cols_c (int sx, int yl, int yh) } // Maps one span at hx to the screen at sx. -void rt_map1col_c (int hx, int sx, int yl, int yh) +void rt_map1col (int hx, int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -180,7 +179,7 @@ void rt_map1col_c (int hx, int sx, int yl, int yh) } // Maps all four spans to the screen starting at sx. 
-void rt_map4cols_c (int sx, int yl, int yh) +void rt_map4cols (int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -222,7 +221,6 @@ void rt_map4cols_c (int sx, int yl, int yh) dest += pitch*2; } while (--count); } -#endif void rt_Translate1col(const BYTE *translation, int hx, int yl, int yh) {