diff --git a/specs/udmf_zdoom.txt b/specs/udmf_zdoom.txt index 9662af1bb..835f3780e 100644 --- a/specs/udmf_zdoom.txt +++ b/specs/udmf_zdoom.txt @@ -252,9 +252,10 @@ Note: All fields default to false unless mentioned otherwise. // negative values are used as their absolute. Default = 1. renderstyle = ; // Set per-actor render style, overriding the class default. Possible values can be "normal", - // "none", "add" or "additive", "subtract" or "subtractive", "stencil", "translucentstencil", - // "translucent", "fuzzy", "optfuzzy", "soultrans". Default is an empty string for no change. - fillcolor = ; // Fill color used by the "stencil" and "translucentstencil" rendestyles, as RRGGBB value, default = 0x000000. + // "none", "add" or "additive", "subtract" or "subtractive", "stencil", "translucentstencil", + // "addstencil", "shaded", "addshaded", "translucent", "fuzzy", "optfuzzy", "soultrans" and "shadow". + // Default is an empty string for no change. + fillcolor = ; // Fill color used by the "stencil", "addstencil" and "translucentstencil" renderstyles, as RRGGBB value, default = 0x000000. alpha = ; // Translucency of this actor (if applicable to renderstyle), default is 1.0. score = ; // Score value of this actor, overriding the class default if not null. Default = 0. pitch = ; // Pitch of thing in degrees. Default = 0 (horizontal). 
diff --git a/src/asm_ia32/tmap.asm b/src/asm_ia32/tmap.asm index fb372d488..2096b9222 100644 --- a/src/asm_ia32/tmap.asm +++ b/src/asm_ia32/tmap.asm @@ -598,894 +598,6 @@ dmsdone add esp,8 -;*---------------------------------------------------------------------- -;* -;* R_DrawColumnP -;* -;*---------------------------------------------------------------------- - -GLOBAL @R_DrawColumnP_ASM@0 -GLOBAL _R_DrawColumnP_ASM -GLOBAL R_DrawColumnP_ASM - - align 16 - -R_DrawColumnP_ASM: -_R_DrawColumnP_ASM: -@R_DrawColumnP_ASM@0: - -; count = dc_yh - dc_yl; - - mov ecx,[dc_count] - test ecx,ecx - jle near rdcpret ; count <= 0: nothing to do, so leave - - push ebp ; save registers - push ebx - push edi - push esi - -; dest = ylookup[dc_yl] + dc_x + dc_destorg; - - mov edi,[dc_dest] - mov ebp,ecx - mov ebx,[dc_texturefrac] ; ebx = frac -rdcp1: sub edi,SPACEFILLER4 - mov ecx,ebx - shr ecx,16 - mov esi,[dc_source] - mov edx,[dc_iscale] - mov eax,[dc_colormap] - - cmp BYTE [CPU+66],byte 5 - jg rdcploop2 - - align 16 - -; The registers should now look like this: -; -; [31 .. 16][15 .. 8][7 .. 0] -; eax [colormap ] -; ebx [yi ][yf ] -; ecx [scratch ] -; edx [dyi ][dyf ] -; esi [source texture column ] -; edi [destination screen pointer ] -; ebp [counter ] -; - - -; Note the partial register stalls on anything better than a Pentium -; That's why there are two versions of this loop. 
- -rdcploop: - mov cl,[esi+ecx] ; Fetch texel - xor ch,ch - add ebx,edx ; increment frac -rdcp2: add edi,SPACEFILLER4 ; increment destination pointer - mov cl,[eax+ecx] ; colormap texel - mov [edi],cl ; Store texel - mov ecx,ebx - shr ecx,16 - dec ebp - jnz rdcploop ; loop - - pop esi - pop edi - pop ebx - pop ebp -rdcpret: - ret - - align 16 - -rdcploop2: - movzx ecx,byte [esi+ecx] ; Fetch texel - add ebx,edx ; increment frac - mov cl,[eax+ecx] ; colormap texel -rdcp3: add edi,SPACEFILLER4 ; increment destination pointer - mov [edi],cl ; Store texel - mov ecx,ebx - shr ecx,16 - dec ebp - jnz rdcploop2 ; loop - - pop esi - pop edi - pop ebx - pop ebp - ret - - - -;*---------------------------------------------------------------------- -;* -;* R_DrawFuzzColumnP -;* -;*---------------------------------------------------------------------- - -GLOBAL @R_DrawFuzzColumnP_ASM@0 -GLOBAL _R_DrawFuzzColumnP_ASM -GLOBAL R_DrawFuzzColumnP_ASM - - align 16 - -R_DrawFuzzColumnP_ASM: -_R_DrawFuzzColumnP_ASM: -@R_DrawFuzzColumnP_ASM@0: - -; Adjust borders. Low... - mov eax,[dc_yl] - push ebx - push esi - push edi - push ebp - - cmp eax,0 - jg .ylok - - mov eax,1 - nop - -; ...and high. 
-.ylok mov edx,[fuzzviewheight] - mov esi,[dc_yh] - cmp esi,edx - jle .yhok - - mov esi,edx - nop - -.yhok mov edx,[dc_x] - sub esi,eax ; esi = count - js near .dfcdone ; Zero length (or less) - - mov edi,[ylookup+eax*4] - mov ebx,edx - add edi,[dc_destorg] - mov eax,[NormalLight] - mov ecx,[fuzzpos] - add edi,ebx - add eax,256*6 - inc esi - mov ebp,[dc_pitch] - mov edx,FUZZTABLE - test ecx,ecx - je .fuzz0 - -; -; esi = count -; edi = dest -; ecx = fuzzpos -; eax = colormap 6 -; - -; first loop: end with fuzzpos or count 0, whichever happens first - - sub edx,ecx ; edx = # of entries left in fuzzoffset - mov ebx,esi - cmp esi,edx - jle .enuf - mov esi,edx -.enuf sub ebx,esi - mov edx,[fuzzoffset+ecx*4] - push ebx - xor ebx,ebx - -.loop1 inc ecx - mov bl,[edi+edx] - dec esi - mov bl,[eax+ebx] - mov [edi],bl - lea edi,[edi+ebp] - mov edx,[fuzzoffset+ecx*4] - jnz .loop1 - -; second loop: Chunk it into groups of FUZZTABLE-sized spans and do those - - pop esi - cmp ecx,FUZZTABLE - jl .savefuzzpos - xor ecx,ecx - nop -.fuzz0 cmp esi,FUZZTABLE - jl .chunked - -.oloop lea edx,[esi-FUZZTABLE] - mov esi,FUZZTABLE - push edx - mov edx,[fuzzoffset+ecx*4] - -.iloop inc ecx - mov bl,[edi+edx] - dec esi - mov bl,[eax+ebx] - mov [edi],bl - lea edi,[edi+ebp] - mov edx,[fuzzoffset+ecx*4] - jnz .iloop - - pop esi - xor ecx,ecx - cmp esi,FUZZTABLE - jge .oloop - -; third loop: Do whatever is left - -.chunked: - test esi,esi - jle .savefuzzpos - mov edx,[fuzzoffset+ecx*4] - nop - -.loop3 inc ecx - mov bl,[edi+edx] - dec esi - mov bl,[eax+ebx] - mov [edi],bl - lea edi,[edi+ebp] - mov edx,[fuzzoffset+ecx*4] - jnz .loop3 - -.savefuzzpos: - mov [fuzzpos],ecx -.dfcdone: - pop ebp - pop edi - pop esi - pop ebx - ret - - -;*---------------------------------------------------------------------- -;* -;* R_DrawColumnHorizP_ASM -;* -;*---------------------------------------------------------------------- - -GLOBAL @R_DrawColumnHorizP_ASM@0 -GLOBAL _R_DrawColumnHorizP_ASM -GLOBAL 
R_DrawColumnHorizP_ASM - - align 16 - -@R_DrawColumnHorizP_ASM@0: -_R_DrawColumnHorizP_ASM: -R_DrawColumnHorizP_ASM: - -; count = dc_yh - dc_yl; - - mov eax,[dc_yh] - mov ecx,[dc_yl] - sub eax,ecx - mov edx,[dc_x] - - jl near .leave ; count < 0: nothing to do, so leave - - push ebp ; save registers - push ebx - push edi - push esi - - inc eax ; make 0 count mean 0 pixels - and edx,3 - push eax - mov eax,[dc_temp] - mov esi,[dc_ctspan+edx*4] - add eax,edx - lea eax,[eax+ecx*4] ; eax = top of column in buffer - mov ebp,[dc_yh] - mov [esi],ecx - mov [esi+4],ebp - add esi,8 - mov edi,[dc_source] - mov [dc_ctspan+edx*4],esi - mov esi,[dc_iscale] - mov ecx,[dc_texturefrac] ; ecx = frac - mov dl,[edi] ; load cache - mov ebx,[esp] - and ebx,0xfffffff8 - jnz .mthan8 - -; Register usage in the following code is: -; -; eax: dest -; edi: source -; ecx: frac (16.16) -; esi: fracstep (16.16) -; ebx: add1 -; ebp: add2 -; dl: texel1 -; dh: texel2 -;[esp] count - -; there are fewer than 8 pixels to draw - - mov ebx,[esp] -.lthan8 shr ebx,1 - jnc .even - -; do one pixel before loop (little opportunity for pairing) - - mov ebp,ecx ; copy frac to ebx - add ecx,esi ; increment frac - shr ebp,16 ; shift frac over to low end - add eax,4 - mov dl,[edi+ebp] - mov [eax-4],dl - -.even test ebx,ebx - jz near .done - -.loop2 mov [esp],ebx ; save counter - mov ebx,ecx ; copy frac for texel1 to ebx - shr ebx,16 ; shift frac for texel1 to low end - add ecx,esi ; increment frac - mov ebp,ecx ; copy frac for texel2 to ebp - shr ebp,16 ; shift frac for texel2 to low end - add ecx,esi ; increment frac - mov dl,[edi+ebx] ; read texel1 - mov ebx,[esp] ; fetch counter - mov dh,[edi+ebp] ; read texel2 - mov [eax],dl ; write texel1 - mov [eax+4],dh ; write texel2 - add eax,8 ; increment dest - dec ebx ; decrement counter - jnz .loop2 ; loop until it hits 0 - - jmp .done - -; there are more than 8 pixels to draw. position eax as close to a 32 byte -; boundary as possible, then do whatever is left. 
- -.mthan8 test eax,4 - jz .try2 - - mov ebp,ecx ; frac: in ebp - add ecx,esi ; step - shr ebp,16 ; frac: shift - add eax,4 ; increment dest - mov ebx,[esp] ; fetch counter - mov dl,[edi+ebp] ; tex: read - dec ebx ; decrement counter - mov [eax-4],dl ; tex: write - mov [esp],ebx ; store counter - -.try2 test eax,8 - jz .try4 - - mov ebx,ecx ; frac1: in ebx - add ecx,esi ; step - shr ebx,16 ; frac1: shift - mov ebp,ecx ; frac2: in ebp - shr ebp,16 ; frac2: shift - add ecx,esi ; step - mov dl,[edi+ebx] ; tex1: read - mov ebx,[esp] ; fetch counter - mov dh,[edi+ebp] ; tex2: read - mov [eax],dl ; tex1: write - mov [eax+4],dh ; tex2: write - sub ebx,2 ; decrement counter - add eax,8 ; increment dest - mov [esp],ebx ; store counter - -.try4 test eax,16 - jz .try8 - - mov ebx,ecx ; frac1: in ebx - add ecx,esi ; step - shr ebx,16 ; frac1: shift - mov ebp,ecx ; frac2: in ebp - shr ebp,16 ; frac2: shift - add ecx,esi ; step - mov dl,[edi+ebx] ; tex1: read - mov ebx,ecx ; frac3: in ebx - shr ebx,16 ; frac3: shift - mov dh,[edi+ebp] ; tex2: read - add ecx,esi ; step - mov [eax],dl ; tex1: write - mov [eax+4],dh ; tex2: write - mov ebp,ecx ; frac4: in ebp - shr ebp,16 ; frac4: shift - add ecx,esi ; step - mov dl,[edi+ebx] ; tex3: read - mov ebx,[esp] ; fetch counter - mov dh,[edi+ebp] ; tex4: read - sub ebx,4 ; decrement counter - mov [esp],ebx ; store counter - mov [eax+8],dl ; tex3: write - mov [eax+12],dh ; tex4: write - add eax,16 ; increment dest - -.try8 mov ebx,[esp] ; make counter count groups of 8 - sub esp,4 - shr ebx,3 - jmp .tail8 - - align 16 - -.loop8 mov [esp],ebx ; save counter - mov ebx,ecx ; frac1: in ebx - shr ebx,16 ; frac1: shift - add ecx,esi ; step - mov ebp,ecx ; frac2: in ebp - shr ebp,16 ; frac2: shift - add ecx,esi ; step - mov dl,[edi+ebx] ; tex1: read - mov ebx,ecx ; frac3: in ebx - mov dh,[edi+ebp] ; tex2: read - shr ebx,16 ; frac3: shift - add ecx,esi ; step - mov [eax],dl ; tex1: write - mov [eax+4],dh ; tex2: write - mov ebp,ecx ; frac4: in ebp 
- shr ebp,16 ; frac4: shift - add ecx,esi ; step - mov dl,[edi+ebx] ; tex3: read - mov ebx,ecx ; frac5: in ebx - mov dh,[edi+ebp] ; tex4: read - shr ebx,16 ; frac5: shift - mov [eax+8],dl ; tex3: write - mov [eax+12],dh ; tex4: write - add ecx,esi ; step - mov ebp,ecx ; frac6: in ebp - shr ebp,16 ; frac6: shift - mov dl,[edi+ebx] ; tex5: read - add ecx,esi ; step - mov ebx,ecx ; frac7: in ebx - mov [eax+16],dl ; tex5: write - shr ebx,16 ; frac7: shift - mov dh,[edi+ebp] ; tex6: read - add ecx,esi ; step - mov ebp,ecx ; frac8: in ebp - mov [eax+20],dh ; tex6: write - shr ebp,16 ; frac8: shift - add eax,32 ; increment dest pointer - mov dl,[edi+ebx] ; tex7: read - mov ebx,[esp] ; fetch counter - mov [eax-8],dl ; tex7: write - mov dh,[edi+ebp] ; tex8: read - add ecx,esi ; step - mov [eax-4],dh ; tex8: write - mov dl,[eax] ; load cache - dec ebx ; decrement counter -.tail8 jnz near .loop8 ; loop if more to do - - pop ebp - mov ebx,[esp] - and ebx,7 - jnz near .lthan8 - -.done pop eax - pop esi - pop edi - pop ebx - pop ebp -.leave ret - - -;*---------------------------------------------------------------------- -;* -;* rt_copy1col_asm -;* -;* ecx = hx -;* edx = sx -;* [esp+4] = yl -;* [esp+8] = yh -;* -;*---------------------------------------------------------------------- - -GLOBAL @rt_copy1col_asm@16 -GLOBAL _rt_copy1col_asm -GLOBAL rt_copy1col_asm - - align 16 - -rt_copy1col_asm: -_rt_copy1col_asm: - pop eax - mov edx,[esp+4*3] - mov ecx,[esp+4*2] - push edx - push ecx - mov ecx,[esp+4*2] - mov edx,[esp+4*3] - push eax - -@rt_copy1col_asm@16: - mov eax, [esp+4] - push ebx - mov ebx, [esp+12] - push esi - sub ebx, eax - push edi - js .done - - lea esi,[eax*4] - inc ebx ; ebx = count - mov eax,edx - add ecx,esi - mov edi,[ylookup+esi] - add ecx,[dc_temp] ; ecx = source - mov esi,[dc_pitch] ; esi = pitch - add eax,edi ; eax = dest - add eax,[dc_destorg] - - shr ebx,1 - jnc .even - - mov dl,[ecx] - add ecx,4 - mov [eax],dl - add eax,esi - -.even and ebx,ebx - jz .done 
- -.loop mov dl,[ecx] - mov dh,[ecx+4] - mov [eax],dl - mov [eax+esi],dh - add ecx,8 - lea eax,[eax+esi*2] - dec ebx - jnz .loop - -.done pop edi - pop esi - pop ebx - ret 8 - -;*---------------------------------------------------------------------- -;* -;* rt_copy4cols_asm -;* -;* ecx = sx -;* edx = yl -;* [esp+4] = yh -;* -;*---------------------------------------------------------------------- - -GLOBAL @rt_copy4cols_asm@12 -GLOBAL _rt_copy4cols_asm -GLOBAL rt_copy4cols_asm - - align 16 - -rt_copy4cols_asm: -_rt_copy4cols_asm: - pop eax - mov ecx,[esp+8] - mov edx,[esp+4] - push ecx - mov ecx,[esp+4] - push eax - -@rt_copy4cols_asm@12: - push ebx - mov ebx,[esp+8] - push esi - sub ebx,edx - push edi - js .done - - inc ebx ; ebx = count - mov eax,ecx - mov esi,[ylookup+edx*4] - mov ecx,[dc_temp] - add eax,esi ; eax = dest - add eax,[dc_destorg] - lea ecx,[ecx+edx*4] ; ecx = source - mov edx,[dc_pitch] ; edx = pitch - - shr ebx,1 - jnc .even - - mov esi,[ecx] - add ecx,4 - mov [eax],esi - add eax,edx - -.even and ebx,ebx - jz .done - -.loop mov esi,[ecx] - mov edi,[ecx+4] - mov [eax],esi - mov [eax+edx],edi - add ecx,8 - lea eax,[eax+edx*2] - dec ebx - jnz .loop - -.done pop edi - pop esi - pop ebx - ret 4 - -;*---------------------------------------------------------------------- -;* -;* rt_map1col_asm -;* -;* ecx = hx -;* edx = sx -;* [esp+4] = yl -;* [esp+8] = yh -;* -;*---------------------------------------------------------------------- - -GLOBAL @rt_map1col_asm@16 -GLOBAL _rt_map1col_asm -GLOBAL rt_map1col_asm - - align 16 - -rt_map1col_asm: -_rt_map1col_asm: - pop eax - mov edx,[esp+4*3] - mov ecx,[esp+4*2] - push edx - push ecx - mov ecx,[esp+4*2] - mov edx,[esp+4*3] - push eax - -@rt_map1col_asm@16: - mov eax,[esp+4] - push ebx - mov ebx,[esp+12] - push ebp - push esi - sub ebx, eax - push edi - js .done - - lea edi,[eax*4] - mov esi,[dc_colormap] ; esi = colormap - inc ebx ; ebx = count - mov eax,edx - lea ebp,[ecx+edi] ; ebp = source - add 
ebp,[dc_temp] - mov ecx,[ylookup+edi] - mov edi,[dc_pitch] ; edi = pitch - add eax,ecx ; eax = dest - xor ecx,ecx - xor edx,edx - add eax,[dc_destorg] - - shr ebx,1 - jnc .even - - mov dl,[ebp] - add ebp,4 - mov dl,[esi+edx] - mov [eax],dl - add eax,edi - -.even and ebx,ebx - jz .done - -.loop mov dl,[ebp] - mov cl,[ebp+4] - add ebp,8 - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax],dl - mov [eax+edi],cl - dec ebx - lea eax,[eax+edi*2] - jnz .loop - -.done pop edi - pop esi - pop ebp - pop ebx - ret 8 - -;*---------------------------------------------------------------------- -;* -;* rt_map4cols_asm -;* -;* rt_map4cols_asm1 is for PPro and above -;* rt_map4cols_asm2 is for Pentium and below -;* -;* ecx = sx -;* edx = yl -;* [esp+4] = yh -;* -;*---------------------------------------------------------------------- - -GLOBAL @rt_map4cols_asm1@12 -GLOBAL _rt_map4cols_asm1 -GLOBAL rt_map4cols_asm1 - - align 16 - -rt_map4cols_asm1: -_rt_map4cols_asm1: - pop eax - mov ecx,[esp+8] - mov edx,[esp+4] - push ecx - mov ecx,[esp+4] - push eax - -@rt_map4cols_asm1@12: - push ebx - mov ebx,[esp+8] - push ebp - push esi - sub ebx,edx - push edi - js near .done - - mov esi,[dc_colormap] ; esi = colormap - shl edx,2 - mov eax,ecx - inc ebx ; ebx = count - mov edi,[ylookup+edx] - mov ebp,[dc_temp] - add ebp,edx ; ebp = source - add eax,edi ; eax = dest - mov edi,[dc_pitch] ; edi = pitch - add eax,[dc_destorg] - xor ecx,ecx - xor edx,edx - - shr ebx,1 - jnc .even - - mov dl,[ebp] - mov cl,[ebp+1] - add ebp,4 - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax],dl - mov [eax+1],cl - mov dl,[ebp-2] - mov cl,[ebp-1] - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax+2],dl - mov [eax+3],cl - add eax,edi - -.even and ebx,ebx - jz .done - -.loop: - mov dl,[ebp] - mov cl,[ebp+1] - add ebp,8 - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax],dl - mov [eax+1],cl - mov dl,[ebp-6] - mov cl,[ebp-5] - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax+2],dl - mov [eax+3],cl - mov dl,[ebp-4] - mov 
cl,[ebp-3] - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax+edi],dl - mov [eax+edi+1],cl - mov dl,[ebp-2] - mov cl,[ebp-1] - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax+edi+2],dl - mov [eax+edi+3],cl - lea eax,[eax+edi*2] - dec ebx - - jnz .loop - -.done pop edi - pop esi - pop ebp - pop ebx - ret 4 - -GLOBAL @rt_map4cols_asm2@12 -GLOBAL _rt_map4cols_asm2 -GLOBAL rt_map4cols_asm2 - - align 16 - -rt_map4cols_asm2: -_rt_map4cols_asm2: - pop eax - mov ecx,[esp+8] - mov edx,[esp+4] - push ecx - mov ecx,[esp+4] - push eax - -@rt_map4cols_asm2@12: - push ebx - mov ebx,[esp+8] - push ebp - push esi - sub ebx,edx - push edi - js near .done - - mov esi,[dc_colormap] ; esi = colormap - shl edx,2 - mov eax,ecx - inc ebx ; ebx = count - mov edi,[ylookup+edx] - mov ebp,[dc_temp] - add ebp,edx ; ebp = source - add eax,edi ; eax = dest - mov edi,[dc_pitch] ; edi = pitch - add eax,[dc_destorg] - xor ecx,ecx - xor edx,edx - - shr ebx,1 - jnc .even - - mov dl,[ebp] - mov cl,[ebp+1] - add ebp,4 - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax],dl - mov [eax+1],cl - mov dl,[ebp-2] - mov cl,[ebp-1] - mov dl,[esi+edx] - mov cl,[esi+ecx] - mov [eax+2],dl - mov [eax+3],cl - add eax,edi - -.even and ebx,ebx - jz .done - -.loop: - mov dl,[ebp+3] - mov ch,[esi+edx] - mov dl,[ebp+2] - mov cl,[esi+edx] - shl ecx,16 - mov dl,[ebp+1] - mov ch,[esi+edx] - mov dl,[ebp] - mov cl,[esi+edx] - mov [eax],ecx - add eax,edi - - mov dl,[ebp+7] - mov ch,[esi+edx] - mov dl,[ebp+6] - mov cl,[esi+edx] - shl ecx,16 - mov dl,[ebp+5] - mov ch,[esi+edx] - mov dl,[ebp+4] - mov cl,[esi+edx] - mov [eax],ecx - add eax,edi - add ebp,8 - dec ebx - - jnz .loop - -.done pop edi - pop esi - pop ebp - pop ebx - ret 4 - - align 16 - GLOBAL rt_shaded4cols_asm GLOBAL _rt_shaded4cols_asm @@ -1875,9 +987,6 @@ ASM_PatchPitch: _ASM_PatchPitch: @ASM_PatchPitch@0: mov eax,[dc_pitch] - mov [rdcp1+2],eax - mov [rdcp2+2],eax - mov [rdcp3+2],eax mov [s4p+1],eax mov [a4p+1],eax mov [ac4p+1],eax diff --git a/src/r_draw.cpp 
b/src/r_draw.cpp index 0f3d04884..6f58ec2a3 100644 --- a/src/r_draw.cpp +++ b/src/r_draw.cpp @@ -69,17 +69,10 @@ int scaledviewwidth; // These get changed depending on the current // screen depth and asm/no asm. void (*R_DrawColumnHoriz)(void); -void (*R_DrawColumn)(void); -void (*R_DrawFuzzColumn)(void); void (*R_DrawTranslatedColumn)(void); void (*R_DrawShadedColumn)(void); void (*R_DrawSpan)(void); void (*R_DrawSpanMasked)(void); -void (*R_DrawSpanTranslucent)(void); -void (*R_DrawSpanMaskedTranslucent)(void); -void (*R_DrawSpanAddClamp)(void); -void (*R_DrawSpanMaskedAddClamp)(void); -void (*rt_map4cols)(int,int,int); // // R_DrawColumn @@ -171,7 +164,6 @@ void R_InitShadeMaps() /* */ /************************************/ -#ifndef X86_ASM // // A column is a vertical slice/span from a wall texture that, // given the DOOM style restrictions on the view orientation, @@ -179,7 +171,7 @@ void R_InitShadeMaps() // Thus a special case loop for very fast rendering can // be used. It has also been used with Wolfenstein 3D. // -void R_DrawColumnP_C (void) +void R_DrawColumn (void) { int count; BYTE* dest; @@ -222,7 +214,7 @@ void R_DrawColumnP_C (void) } while (--count); } } -#endif + // [RH] Just fills a column with a color void R_FillColumnP (void) @@ -414,13 +406,12 @@ void R_InitFuzzTable (int fuzzoff) } } -#ifndef X86_ASM // // Creates a fuzzy image by copying pixels from adjacent ones above and below. // Used with an all black colormap, this could create the SHADOW effect, // i.e. spectres and invisible players. 
// -void R_DrawFuzzColumnP_C (void) +void R_DrawFuzzColumn (void) { int count; BYTE *dest; @@ -490,7 +481,6 @@ void R_DrawFuzzColumnP_C (void) fuzzpos = fuzz; } } -#endif // // R_DrawTranlucentColumn @@ -1046,7 +1036,7 @@ void R_SetupSpanBits(FTexture *tex) { ds_xbits--; } - if ((1 << ds_ybits) > tex->GetHeight()) + if ((1 << ds_ybits) > tex->GetHeight()) { ds_ybits--; } @@ -1057,7 +1047,7 @@ void R_SetupSpanBits(FTexture *tex) // // Draws the actual span. -#ifndef X86_ASM +//#ifndef X86_ASM void R_DrawSpanP_C (void) { dsfixed_t xfrac; @@ -1156,7 +1146,7 @@ void R_DrawSpanMaskedP_C (void) // 64x64 is the most common case by far, so special case it. do { - BYTE texdata; + int texdata; spot = ((xfrac>>(32-6-6))&(63*64)) + (yfrac>>(32-6)); texdata = source[spot]; @@ -1176,7 +1166,7 @@ void R_DrawSpanMaskedP_C (void) int xmask = ((1 << ds_xbits) - 1) << ds_ybits; do { - BYTE texdata; + int texdata; spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); texdata = source[spot]; @@ -1190,9 +1180,9 @@ void R_DrawSpanMaskedP_C (void) } while (--count); } } -#endif +//#endif -void R_DrawSpanTranslucentP_C (void) +void R_DrawSpanTranslucent (void) { dsfixed_t xfrac; dsfixed_t yfrac; @@ -1252,7 +1242,7 @@ void R_DrawSpanTranslucentP_C (void) } } -void R_DrawSpanMaskedTranslucentP_C (void) +void R_DrawSpanMaskedTranslucent (void) { dsfixed_t xfrac; dsfixed_t yfrac; @@ -1326,7 +1316,7 @@ void R_DrawSpanMaskedTranslucentP_C (void) } } -void R_DrawSpanAddClampP_C (void) +void R_DrawSpanAddClamp (void) { dsfixed_t xfrac; dsfixed_t yfrac; @@ -1392,7 +1382,7 @@ void R_DrawSpanAddClampP_C (void) } } -void R_DrawSpanMaskedAddClampP_C (void) +void R_DrawSpanMaskedAddClamp (void) { dsfixed_t xfrac; dsfixed_t yfrac; @@ -1682,6 +1672,7 @@ DWORD vlinec1 () return frac; } +#ifndef _M_X64 void vlinec4 () { BYTE *dest = dc_dest; @@ -1698,6 +1689,43 @@ void vlinec4 () dest += dc_pitch; } while (--count); } +#else +// Optimized version for 64 bit. 
In 64 bit mode, accessing global variables is very expensive so even though +// this exceeds the register count, loading all those values into a local variable is faster than not loading all of them. +void vlinec4() +{ + BYTE *dest = dc_dest; + int count = dc_count; + int bits = vlinebits; + DWORD place; + auto pal0 = palookupoffse[0]; + auto pal1 = palookupoffse[1]; + auto pal2 = palookupoffse[2]; + auto pal3 = palookupoffse[3]; + auto buf0 = bufplce[0]; + auto buf1 = bufplce[1]; + auto buf2 = bufplce[2]; + auto buf3 = bufplce[3]; + const auto vince0 = vince[0]; + const auto vince1 = vince[1]; + const auto vince2 = vince[2]; + const auto vince3 = vince[3]; + auto vplce0 = vplce[0]; + auto vplce1 = vplce[1]; + auto vplce2 = vplce[2]; + auto vplce3 = vplce[3]; + + do + { + dest[0] = pal0[buf0[(place = vplce0) >> bits]]; vplce0 = place + vince0; + dest[1] = pal1[buf1[(place = vplce1) >> bits]]; vplce1 = place + vince1; + dest[2] = pal2[buf2[(place = vplce2) >> bits]]; vplce2 = place + vince2; + dest[3] = pal3[buf3[(place = vplce3) >> bits]]; vplce3 = place + vince3; + dest += dc_pitch; + } while (--count); +} +#endif + #endif void setupmvline (int fracbits) @@ -2555,35 +2583,18 @@ const BYTE *R_GetColumn (FTexture *tex, int col) void R_InitColumnDrawers () { #ifdef X86_ASM - R_DrawColumn = R_DrawColumnP_ASM; - R_DrawColumnHoriz = R_DrawColumnHorizP_ASM; - R_DrawFuzzColumn = R_DrawFuzzColumnP_ASM; + R_DrawColumnHoriz = R_DrawColumnHorizP_C; R_DrawTranslatedColumn = R_DrawTranslatedColumnP_C; R_DrawShadedColumn = R_DrawShadedColumnP_C; R_DrawSpan = R_DrawSpanP_ASM; R_DrawSpanMasked = R_DrawSpanMaskedP_ASM; - if (CPU.Family <= 5) - { - rt_map4cols = rt_map4cols_asm2; - } - else - { - rt_map4cols = rt_map4cols_asm1; - } #else R_DrawColumnHoriz = R_DrawColumnHorizP_C; - R_DrawColumn = R_DrawColumnP_C; - R_DrawFuzzColumn = R_DrawFuzzColumnP_C; R_DrawTranslatedColumn = R_DrawTranslatedColumnP_C; R_DrawShadedColumn = R_DrawShadedColumnP_C; R_DrawSpan = R_DrawSpanP_C; 
R_DrawSpanMasked = R_DrawSpanMaskedP_C; - rt_map4cols = rt_map4cols_c; #endif - R_DrawSpanTranslucent = R_DrawSpanTranslucentP_C; - R_DrawSpanMaskedTranslucent = R_DrawSpanMaskedTranslucentP_C; - R_DrawSpanAddClamp = R_DrawSpanAddClampP_C; - R_DrawSpanMaskedAddClamp = R_DrawSpanMaskedAddClampP_C; } // [RH] Choose column drawers in a single place diff --git a/src/r_draw.h b/src/r_draw.h index fa84e5ae9..6713d4091 100644 --- a/src/r_draw.h +++ b/src/r_draw.h @@ -65,7 +65,6 @@ extern "C" unsigned int horizspans[4]; // The span blitting interface. // Hook in assembler or system specific BLT here. -extern void (*R_DrawColumn)(void); extern DWORD (*dovline1) (); extern DWORD (*doprevline1) (); @@ -84,7 +83,7 @@ extern void setupmvline (int); extern void setuptmvline (int); // The Spectre/Invisibility effect. -extern void (*R_DrawFuzzColumn)(void); +extern void R_DrawFuzzColumn(void); // [RH] Draw shaded column extern void (*R_DrawShadedColumn)(void); @@ -103,16 +102,16 @@ void R_SetSpanSource(const BYTE *pixels); extern void (*R_DrawSpanMasked)(void); // Span drawing for translucent textures. -extern void (*R_DrawSpanTranslucent)(void); +void R_DrawSpanTranslucent(void); // Span drawing for masked, translucent textures. -extern void (*R_DrawSpanMaskedTranslucent)(void); +void R_DrawSpanMaskedTranslucent(void); // Span drawing for translucent, additive textures. -extern void (*R_DrawSpanAddClamp)(void); +void R_DrawSpanAddClamp(void); // Span drawing for masked, translucent, additive textures. -extern void (*R_DrawSpanMaskedAddClamp)(void); +void R_DrawSpanMaskedAddClamp(void); // [RH] Span blit into an interleaved intermediate buffer extern void (*R_DrawColumnHoriz)(void); @@ -121,16 +120,19 @@ extern void (*R_DrawColumnHoriz)(void); void R_InitColumnDrawers (); // [RH] Moves data from the temporary buffer to the screen. 
+ +void rt_copy1col(int hx, int sx, int yl, int yh); +void rt_copy4cols(int sx, int yl, int yh); +void rt_map4cols(int sx, int yl, int yh); + extern "C" { -void rt_copy1col_c (int hx, int sx, int yl, int yh); -void rt_copy4cols_c (int sx, int yl, int yh); void rt_shaded1col (int hx, int sx, int yl, int yh); void rt_shaded4cols_c (int sx, int yl, int yh); void rt_shaded4cols_asm (int sx, int yl, int yh); -void rt_map1col_c (int hx, int sx, int yl, int yh); +void rt_map1col (int hx, int sx, int yl, int yh); void rt_add1col (int hx, int sx, int yl, int yh); void rt_addclamp1col (int hx, int sx, int yl, int yh); void rt_subclamp1col (int hx, int sx, int yl, int yh); @@ -142,7 +144,6 @@ void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh); void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh); void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh); -void rt_map4cols_c (int sx, int yl, int yh); void rt_add4cols_c (int sx, int yl, int yh); void rt_addclamp4cols_c (int sx, int yl, int yh); void rt_subclamp4cols (int sx, int yl, int yh); @@ -154,29 +155,16 @@ void rt_tlateaddclamp4cols (int sx, int yl, int yh); void rt_tlatesubclamp4cols (int sx, int yl, int yh); void rt_tlaterevsubclamp4cols (int sx, int yl, int yh); -void rt_copy1col_asm (int hx, int sx, int yl, int yh); -void rt_map1col_asm (int hx, int sx, int yl, int yh); - -void rt_copy4cols_asm (int sx, int yl, int yh); -void rt_map4cols_asm1 (int sx, int yl, int yh); -void rt_map4cols_asm2 (int sx, int yl, int yh); void rt_add4cols_asm (int sx, int yl, int yh); void rt_addclamp4cols_asm (int sx, int yl, int yh); } -extern void (*rt_map4cols)(int sx, int yl, int yh); #ifdef X86_ASM -#define rt_copy1col rt_copy1col_asm -#define rt_copy4cols rt_copy4cols_asm -#define rt_map1col rt_map1col_asm #define rt_shaded4cols rt_shaded4cols_asm #define rt_add4cols rt_add4cols_asm #define rt_addclamp4cols rt_addclamp4cols_asm #else -#define rt_copy1col rt_copy1col_c -#define rt_copy4cols rt_copy4cols_c -#define 
rt_map1col rt_map1col_c #define rt_shaded4cols rt_shaded4cols_c #define rt_add4cols rt_add4cols_c #define rt_addclamp4cols rt_addclamp4cols_c @@ -193,29 +181,25 @@ void R_DrawFogBoundary (int x1, int x2, short *uclip, short *dclip); #ifdef X86_ASM -extern "C" void R_DrawColumnP_Unrolled (void); -extern "C" void R_DrawColumnHorizP_ASM (void); -extern "C" void R_DrawColumnP_ASM (void); -extern "C" void R_DrawFuzzColumnP_ASM (void); - void R_DrawTranslatedColumnP_C (void); void R_DrawShadedColumnP_C (void); extern "C" void R_DrawSpanP_ASM (void); extern "C" void R_DrawSpanMaskedP_ASM (void); +void R_DrawColumnHorizP_C(void); + #else -void R_DrawColumnHorizP_C (void); -void R_DrawColumnP_C (void); -void R_DrawFuzzColumnP_C (void); -void R_DrawTranslatedColumnP_C (void); void R_DrawShadedColumnP_C (void); void R_DrawSpanP_C (void); void R_DrawSpanMaskedP_C (void); #endif -void R_DrawSpanTranslucentP_C (void); -void R_DrawSpanMaskedTranslucentP_C (void); +void R_DrawColumn(); +void R_DrawColumnHorizP_C(void); +void R_DrawTranslatedColumnP_C(void); +void R_DrawSpanTranslucent (void); +void R_DrawSpanMaskedTranslucent (void); void R_DrawTlatedLucentColumnP_C (void); #define R_DrawTlatedLucentColumn R_DrawTlatedLucentColumnP_C diff --git a/src/r_drawt.cpp b/src/r_drawt.cpp index cb228cce0..a4f581d12 100644 --- a/src/r_drawt.cpp +++ b/src/r_drawt.cpp @@ -69,9 +69,8 @@ extern "C" void R_SetupAddCol(); extern "C" void R_SetupAddClampCol(); #endif -#ifndef X86_ASM // Copies one span at hx to the screen at sx. -void rt_copy1col_c (int hx, int sx, int yl, int yh) +void rt_copy1col (int hx, int sx, int yl, int yh) { BYTE *source; BYTE *dest; @@ -112,7 +111,7 @@ void rt_copy1col_c (int hx, int sx, int yl, int yh) } // Copies all four spans to the screen starting at sx. 
-void rt_copy4cols_c (int sx, int yl, int yh) +void rt_copy4cols (int sx, int yl, int yh) { int *source; int *dest; @@ -145,7 +144,7 @@ void rt_copy4cols_c (int sx, int yl, int yh) } // Maps one span at hx to the screen at sx. -void rt_map1col_c (int hx, int sx, int yl, int yh) +void rt_map1col (int hx, int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -180,7 +179,7 @@ void rt_map1col_c (int hx, int sx, int yl, int yh) } // Maps all four spans to the screen starting at sx. -void rt_map4cols_c (int sx, int yl, int yh) +void rt_map4cols (int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -222,7 +221,6 @@ void rt_map4cols_c (int sx, int yl, int yh) dest += pitch*2; } while (--count); } -#endif void rt_Translate1col(const BYTE *translation, int hx, int yl, int yh) {