gzdoom/src/asm_ia32/tmap3.asm

334 lines
6.9 KiB
NASM
Raw Normal View History

About a week's worth of changes here. As a heads-up, I wouldn't be surprised if this doesn't build in Linux right now. The CMakeLists.txt were checked with MinGW and NMake, but how they fair under Linux is an unknown to me at this time. - Converted most sprintf (and all wsprintf) calls to either mysnprintf or FStrings, depending on the situation. - Changed the strings in the wbstartstruct to be FStrings. - Changed myvsnprintf() to output nothing if count is greater than INT_MAX. This is so that I can use a series of mysnprintf() calls and advance the pointer for each one. Once the pointer goes beyond the end of the buffer, the count will go negative, but since it's an unsigned type it will be seen as excessively huge instead. This should not be a problem, as there's no reason for ZDoom to be using text buffers larger than 2 GB anywhere. - Ripped out the disabled bit from FGameConfigFile::MigrateOldConfig(). - Changed CalcMapName() to return an FString instead of a pointer to a static buffer. - Changed startmap in d_main.cpp into an FString. - Changed CheckWarpTransMap() to take an FString& as the first argument. - Changed d_mapname in g_level.cpp into an FString. - Changed DoSubstitution() in ct_chat.cpp to place the substitutions in an FString. - Fixed: The MAPINFO parser wrote into the string buffer to construct a map name when given a Hexen map number. This was fine with the old scanner code, but only a happy coincidence prevents it from crashing with the new code - Added the 'B' conversion specifier to StringFormat::VWorker() for printing binary numbers. - Added CMake support for building with MinGW, MSYS, and NMake. Linux support is probably broken until I get around to booting into Linux again. Niceties provided over the existing Makefiles they're replacing: * All command-line builds can use the same build system, rather than having a separate one for MinGW and another for Linux. * Microsoft's NMake tool is supported as a target. * Progress meters. * Parallel makes work from a fresh checkout without needing to be primed first with a single-threaded make. * Porting to other architectures should be simplified, whenever that day comes. - Replaced the makewad tool with zipdir. This handles the dependency tracking itself instead of generating an external makefile to do it, since I couldn't figure out how to generate a makefile with an external tool and include it with a CMake-generated makefile. Where makewad used a master list of files to generate the package file, zipdir just zips the entire contents of one or more directories. - Added the gdtoa package from netlib's fp library so that ZDoom's printf-style formatting can be entirely independant of the CRT. SVN r1082 (trunk)
2008-07-23 04:57:26 +00:00
%include "valgrind.inc"
%ifdef M_TARGET_WATCOM
SEGMENT DATA PUBLIC ALIGN=16 CLASS=DATA USE32
SEGMENT DATA
%else
SECTION .data
%endif
%ifndef M_TARGET_LINUX
%define ylookup _ylookup
%define vplce _vplce
%define vince _vince
%define palookupoffse _palookupoffse
%define bufplce _bufplce
%define dc_iscale _dc_iscale
%define dc_colormap _dc_colormap
%define dc_count _dc_count
%define dc_dest _dc_dest
%define dc_source _dc_source
%define dc_texturefrac _dc_texturefrac
%define dc_pitch _dc_pitch
%define setupvlinetallasm _setupvlinetallasm
%define vlinetallasm4 _vlinetallasm4
%define vlinetallasmathlon4 _vlinetallasmathlon4
%define vlinetallasm1 _vlinetallasm1
%define prevlinetallasm1 _prevlinetallasm1
%endif
EXTERN vplce
EXTERN vince
EXTERN palookupoffse
EXTERN bufplce
EXTERN ylookup
EXTERN dc_iscale
EXTERN dc_colormap
EXTERN dc_count
EXTERN dc_dest
EXTERN dc_source
EXTERN dc_texturefrac
EXTERN dc_pitch
GLOBAL vlt4pitch
GLOBAL vlt1pitch
%ifdef M_TARGET_WATCOM
SEGMENT CODE PUBLIC ALIGN=16 CLASS=CODE USE32
SEGMENT CODE
%else
SECTION .text
%endif
ALIGN 16
GLOBAL setpitch3
setpitch3:
mov [vltpitch+2], eax
mov [vltpitcha+2],eax
mov [vlt1pitch1+2], eax
mov [vlt1pitch2+2], eax
selfmod vltpitch, vlt1pitch2+6
ret
ALIGN 16
GLOBAL setupvlinetallasm
setupvlinetallasm:
mov ecx, [esp+4]
mov [shifter1+2], cl
mov [shifter2+2], cl
mov [shifter3+2], cl
mov [shifter4+2], cl
mov [shifter1a+2], cl
mov [shifter2a+2], cl
mov [shifter3a+2], cl
mov [shifter4a+2], cl
mov [preshift+2], cl
mov [shift11+2], cl
mov [shift12+2], cl
selfmod shifter1, shift12+6
ret
SECTION .rtext progbits alloc exec write align=64
ALIGN 16
GLOBAL vlinetallasm4
vlinetallasm4:
push ebx
mov eax, [bufplce+0]
mov ebx, [bufplce+4]
mov ecx, [bufplce+8]
mov edx, [bufplce+12]
mov [source1+3], eax
mov [source2+3], ebx
mov [source3+3], ecx
mov [source4+3], edx
mov eax, [palookupoffse+0]
mov ebx, [palookupoffse+4]
mov ecx, [palookupoffse+8]
mov edx, [palookupoffse+12]
mov [lookup1+2], eax
mov [lookup2+2], ebx
mov [lookup3+2], ecx
mov [lookup4+2], edx
mov eax, [vince+0]
mov ebx, [vince+4]
mov ecx, [vince+8]
mov edx, [vince+12]
mov [step1+2], eax
mov [step2+2], ebx
mov [step3+2], ecx
mov [step4+1], edx
push ebp
push esi
push edi
mov ecx, [dc_count]
mov edi, [dc_dest]
mov eax, dword [ylookup+ecx*4-4]
add eax, edi
sub edi, eax
mov [write1+2],eax
inc eax
mov [write2+2],eax
inc eax
mov [write3+2],eax
inc eax
mov [write4+2],eax
mov ebx, [vplce]
mov ecx, [vplce+4]
mov esi, [vplce+8]
mov eax, [vplce+12]
selfmod loopit, vltpitch
jmp loopit
ALIGN 16
loopit:
mov edx, ebx
shifter1: shr edx, 24
source1: movzx edx, BYTE [edx+0x88888888]
lookup1: mov dl, [edx+0x88888888]
write1: mov [edi+0x88888880], dl
step1: add ebx, 0x88888888
mov edx, ecx
shifter2: shr edx, 24
source2: movzx edx, BYTE [edx+0x88888888]
lookup2: mov dl, [edx+0x88888888]
write2: mov [edi+0x88888881], dl
step2: add ecx, 0x88888888
mov edx, esi
shifter3: shr edx, 24
source3: movzx edx, BYTE [edx+0x88888888]
lookup3: mov dl, BYTE [edx+0x88888888]
write3: mov [edi+0x88888882], dl
step3: add esi, 0x88888888
mov edx, eax
shifter4: shr edx, 24
source4: movzx edx, BYTE [edx+0x88888888]
lookup4: mov dl, [edx+0x88888888]
write4: mov [edi+0x88888883], dl
step4: add eax, 0x88888888
vltpitch: add edi, 320
jle near loopit
mov [vplce], ebx
mov [vplce+4], ecx
mov [vplce+8], esi
mov [vplce+12], eax
pop edi
pop esi
pop ebp
pop ebx
ret
ALIGN 16
GLOBAL vlinetallasmathlon4
vlinetallasmathlon4:
push ebx
mov eax, [bufplce+0]
mov ebx, [bufplce+4]
mov ecx, [bufplce+8]
mov edx, [bufplce+12]
mov [source1a+3], eax
mov [source2a+3], ebx
mov [source3a+3], ecx
mov [source4a+3], edx
mov eax, [palookupoffse+0]
mov ebx, [palookupoffse+4]
mov ecx, [palookupoffse+8]
mov edx, [palookupoffse+12]
mov [lookup1a+2], eax
mov [lookup2a+2], ebx
mov [lookup3a+2], ecx
mov [lookup4a+2], edx
mov eax, [vince+0]
mov ebx, [vince+4]
mov ecx, [vince+8]
mov edx, [vince+12]
mov [step1a+2], eax
mov [step2a+2], ebx
mov [step3a+2], ecx
mov [step4a+1], edx
push ebp
push esi
push edi
mov ecx, [dc_count]
mov edi, [dc_dest]
mov eax, dword [ylookup+ecx*4-4]
add eax, edi
sub edi, eax
mov [write1a+2],eax
inc eax
mov [write2a+2],eax
inc eax
mov [write3a+2],eax
inc eax
mov [write4a+2],eax
mov ebp, [vplce]
mov ecx, [vplce+4]
mov esi, [vplce+8]
mov eax, [vplce+12]
selfmod loopita, vltpitcha
jmp loopita
; Unfortunately, this code has not been carefully analyzed to determine
; how well it utilizes the processor's instruction units. Instead, I just
; kept rearranging code, seeing what sped it up and what slowed it down
; until I arrived at this. The is the fastest version I was able to
; manage, but that does not mean it cannot be made faster with careful
; instructing shuffling.
ALIGN 64
loopita: mov edx, ebp
mov ebx, ecx
shifter1a: shr edx, 24
shifter2a: shr ebx, 24
source1a: movzx edx, BYTE [edx+0x88888888]
source2a: movzx ebx, BYTE [ebx+0x88888888]
step1a: add ebp, 0x88888888
step2a: add ecx, 0x88888888
lookup1a: mov dl, [edx+0x88888888]
lookup2a: mov dh, [ebx+0x88888888]
mov ebx, esi
write1a: mov [edi+0x88888880], dl
write2a: mov [edi+0x88888881], dh
shifter3a: shr ebx, 24
mov edx, eax
source3a: movzx ebx, BYTE [ebx+0x88888888]
shifter4a: shr edx, 24
step3a: add esi, 0x88888888
source4a: movzx edx, BYTE [edx+0x88888888]
step4a: add eax, 0x88888888
lookup3a: mov bl, [ebx+0x88888888]
lookup4a: mov dl, [edx+0x88888888]
write3a: mov [edi+0x88888882], bl
write4a: mov [edi+0x88888883], dl
vltpitcha: add edi, 320
jle near loopita
mov [vplce], ebp
mov [vplce+4], ecx
mov [vplce+8], esi
mov [vplce+12], eax
pop edi
pop esi
pop ebp
pop ebx
ret
ALIGN 16
GLOBAL prevlinetallasm1
prevlinetallasm1:
mov ecx, [dc_count]
cmp ecx, 1
ja vlinetallasm1
mov eax, [dc_iscale]
mov edx, [dc_texturefrac]
add eax, edx
mov ecx, [dc_source]
preshift: shr edx, 16
push ebx
push edi
mov edi, [dc_colormap]
movzx ebx, byte [ecx+edx]
mov ecx, [dc_dest]
mov bl, byte [edi+ebx]
pop edi
mov byte [ecx], bl
pop ebx
ret
ALIGN 16
GLOBAL vlinetallasm1
vlinetallasm1:
push ebp
push ebx
push edi
push esi
mov ebp, [dc_count]
mov ebx, [dc_texturefrac] ; ebx = frac
mov edi, [dc_dest]
mov ecx, ebx
shift11: shr ecx, 16
mov esi, [dc_source]
mov edx, [dc_iscale]
vlt1pitch1: sub edi, 0x88888888
mov eax, [dc_colormap]
loop2:
movzx ecx, BYTE [esi+ecx]
add ebx, edx
vlt1pitch2: add edi, 0x88888888
mov cl,[eax+ecx]
mov [edi],cl
mov ecx,ebx
shift12: shr ecx,16
dec ebp
jnz loop2
mov eax,ebx
pop esi
pop edi
pop ebx
pop ebp
ret