- Fixed mvlineasm1 and mvlineasm4 so that they can be used with textures
  taller than 256 pixels. There was a very slight performance hit for this,
  but I was able to tweak mvlineasm4 to make it approximately as fast as
  before. Interestingly, maskwallscan manages to be nearly as fast as
  wallscan despite having to check every pixel for transparency. I'm
  tempted to dump all the old masked rendering code and use
  (trans)maskwallscan for everything for the sake of simplicity: Only
  two functions to maintain for each render style, and much less
  complicated supporting code. Currently, I need five different functions
  for each rendering style: One traditional column-at-a-time style like
  Doom did it originally, two for rt_draw4cols, and two for transmaskwallscan.
  (Right now, I have cheated, and just done the ones that can be used
  by walls for transmaskwallscan, so the actual number of different functions
  isn't quite so high.) For small textures, such as font characters and
  far-away sprites, I'm sure maskwallscan is faster than the current code.
  For large textures, it's probably still competitive even if it isn't faster.
  But considering how similarly wallscan and maskwallscan perform, the
  difference is probably pretty minimal, and maskwallscan still might come
  out ahead due to its simpler overhead.


SVN r105 (trunk)
This commit is contained in:
Randy Heit 2006-05-11 01:15:15 +00:00
parent ee12c25f47
commit 0069ca4072
2 changed files with 39 additions and 19 deletions

View file

@ -1,3 +1,24 @@
May 10, 2006
- Fixed mvlineasm1 and mvlineasm4 so that they can be used with textures
taller than 256 pixels. There was a very slight performance hit for this,
but I was able to tweak mvlineasm4 to make it approximately as fast as
before. Interestingly, maskwallscan manages to be nearly as fast as
wallscan despite having to check every pixel for transparency. I'm
tempted to dump all the old masked rendering code and use
(trans)maskwallscan for everything for the sake of simplicity: Only
two functions to maintain for each render style, and much less
complicated supporting code. Currently, I need five different functions
for each rendering style: One traditional column-at-a-time style like
Doom did it originally, two for rt_draw4cols, and two for transmaskwallscan.
(Right now, I have cheated, and just done the ones that can be used
by walls for transmaskwallscan, so the actual number of different functions
isn't quite so high.) For small textures, such as font characters and
far-away sprites, I'm sure maskwallscan is faster than the current code.
For large textures, it's probably still competitive even if it isn't faster.
But considering how similarly wallscan and maskwallscan perform, the
difference is probably pretty minimal, and maskwallscan still might come
out ahead due to its simpler overhead.
May 10, 2006 (Changes by Graf Zahl)
- Fixed: PClass::CreateNew didn't check whether the class had valid
defaults and tried to copy data from a NULL pointer.

View file

@ -341,8 +341,8 @@ mvlineasm1:
beginmvline:
mov ebx, edx
maskmach3a: shr ebx, 32
mov bl, byte [esi+ebx]
cmp bl, 0
movzx ebx, byte [esi+ebx]
cmp ebx, 0
je short skipmask1
maskmach3c: mov bl, byte [ebp+ebx]
mov [edi], bl
@ -371,12 +371,12 @@ mvlineasm4:
mov eax, [bufplce+0]
mov ebx, [bufplce+4]
mov [machmv1+2], eax
mov [machmv4+2], ebx
mov [machmv1+3], eax
mov [machmv4+3], ebx
mov eax, [bufplce+8]
mov ebx, [bufplce+12]
mov [machmv7+2], eax
mov [machmv10+2], ebx
mov [machmv7+3], eax
mov [machmv10+3], ebx
mov eax, [palookupoffse]
mov ebx, [palookupoffse+4]
@ -389,7 +389,6 @@ mvlineasm4:
mov eax, [vince] ;vince
mov ebx, [vince+4]
xor al, al
xor bl, bl
mov [machmv3+2], eax
mov [machmv6+2], ebx
@ -415,37 +414,37 @@ beginmvlineasm4:
mov eax, ebp
mov ebx, esi
machmv16: shr eax, 32
machmv15: shr ebx, 32
machmv12: add ebp, 0x88888888 ;vince[3]
machmv15: shr ebx, 32
machmv9: add esi, 0x88888888 ;vince[2]
machmv10: mov al, [eax+0x88888888] ;bufplce[3]
machmv7: mov bl, [ebx+0x88888888] ;bufplce[2]
cmp al, 1
machmv10: movzx eax, byte [eax+0x88888888];bufplce[3]
machmv7: movzx ebx, byte [ebx+0x88888888];bufplce[2]
cmp eax, 1
adc dl, dl
cmp bl, 1
cmp ebx, 1
adc dl, dl
machmv8: mov bl, [ebx+0x88888888] ;palookupoffs[2]
machmv11: mov bh, [eax+0x88888888] ;palookupoffs[3]
mov eax, edx
machmv6: add edx, 0x88888888 ;vince[1]
machmv14: shr eax, 32
shl ebx, 16
machmv4: mov al, [eax+0x88888888] ;bufplce[1]
cmp al, 1
machmv4: movzx eax, byte [eax+0x88888888];bufplce[1]
cmp eax, 1
adc dl, dl
machmv6: add edx, 0x88888888 ;vince[1]
machmv5: mov bh, [eax+0x88888888] ;palookupoffs[1]
mov eax, ecx
machmv13: shr eax, 32
machmv3: add ecx, 0x88888888 ;vince[0]
machmv1: mov al, [eax+0x88888888] ;bufplce[0]
cmp al, 1
machmv13: shr eax, 32
machmv1: movzx eax, byte [eax+0x88888888];bufplce[0]
cmp eax, 1
adc dl, dl
machmv2: mov bl, [eax+0x88888888] ;palookupoffs[0]
shl dl, 4
xor eax, eax
shl dl, 4
fixchain2mb: add edi, 320
mov al, dl
add eax, mvcase15