- Fixed mvlineasm1 and mvlineasm4 so that they can be used with textures
  taller than 256 pixels. There was a very slight performance hit for this,
  but I was able to tweak mvlineasm4 to make it approximately as fast as
  before. Interestingly, maskwallscan manages to be nearly as fast as
  wallscan despite having to check every pixel for transparency. I'm
  tempted to dump all the old masked rendering code and use
  (trans)maskwallscan for everything for the sake of simplicity: only
  two functions to maintain for each render style, and much less
  complicated supporting code. Currently, I need five different functions
  for each rendering style: one traditional column-at-a-time function, as
  Doom originally did it, two for rt_draw4cols, and two for transmaskwallscan.
  (Right now, I have cheated and only done the ones that walls can use
  with transmaskwallscan, so the actual number of different functions
  isn't quite so high.) For small textures, such as font characters and
  far-away sprites, I'm sure maskwallscan is faster than the current code.
  For large textures, it's probably still competitive even if it isn't faster.
  But considering how similarly wallscan and maskwallscan perform, the
  difference is probably minimal, and maskwallscan might still come out
  ahead thanks to its lower overhead. (See the sketch below.)
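
For reference, a minimal C++ sketch of the idea behind (trans)maskwallscan,
using hypothetical names (DrawMaskedColumn and its parameters are
illustrative, not the actual ZDoom functions): draw one masked column,
treating texel 0 as transparent, with a per-texture shift so heights above
256 still index correctly.

    #include <cstdint>

    // Hypothetical masked-column drawer: writes only opaque texels.
    void DrawMaskedColumn(
        const uint8_t *source,    // column of palette indices
        const uint8_t *colormap,  // 256-entry shade table
        uint8_t *dest,            // first destination pixel
        int pitch,                // bytes per frame-buffer row
        int count,                // pixels to draw
        uint32_t frac,            // fixed-point texture position
        uint32_t fracstep,        // position step per screen pixel
        int shift)                // 32 - log2(texture height)
    {
        while (count-- > 0)
        {
            // For tall textures the index can exceed 255.
            uint8_t texel = source[frac >> shift];
            if (texel != 0)               // texel 0 means "transparent"
                *dest = colormap[texel];  // shade and write opaque pixels
            dest += pitch;
            frac += fracstep;
        }
    }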


SVN r105 (trunk)
Randy Heit 2006-05-11 01:15:15 +00:00
parent ee12c25f47
commit 0069ca4072
2 changed files with 39 additions and 19 deletions


@@ -1,3 +1,24 @@
+May 10, 2006
+- Fixed mvlineasm1 and mvlineasm4 so that they can be used with textures
+  taller than 256 pixels. There was a very slight performance hit for this,
+  but I was able to tweak mvlineasm4 to make it approximately as fast as
+  before. Interestingly, maskwallscan manages to be nearly as fast as
+  wallscan despite having to check every pixel for transparency. I'm
+  tempted to dump all the old masked rendering code and use
+  (trans)maskwallscan for everything for the sake of simplicity: only
+  two functions to maintain for each render style, and much less
+  complicated supporting code. Currently, I need five different functions
+  for each rendering style: one traditional column-at-a-time function, as
+  Doom originally did it, two for rt_draw4cols, and two for transmaskwallscan.
+  (Right now, I have cheated and only done the ones that walls can use
+  with transmaskwallscan, so the actual number of different functions
+  isn't quite so high.) For small textures, such as font characters and
+  far-away sprites, I'm sure maskwallscan is faster than the current code.
+  For large textures, it's probably still competitive even if it isn't faster.
+  But considering how similarly wallscan and maskwallscan perform, the
+  difference is probably minimal, and maskwallscan might still come out
+  ahead thanks to its lower overhead.
 May 10, 2006 (Changes by Graf Zahl)
 - Fixed: PClass::CreateNew didn't check whether the class had valid
   defaults and tried to copy data from a NULL pointer.
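
A minimal sketch of the fix Graf Zahl's entry describes, using a simplified
stand-in for the real class metadata (ClassInfo, CreateNew, and the field
names here are illustrative, not the actual PClass code):

    #include <cstdint>
    #include <cstdlib>
    #include <cstring>

    struct ClassInfo
    {
        size_t Size;              // instance size in bytes
        const uint8_t *Defaults;  // may be NULL if defaults were never built
    };

    void *CreateNew(const ClassInfo &cls)
    {
        uint8_t *mem = static_cast<uint8_t *>(malloc(cls.Size));
        if (cls.Defaults != NULL)
            memcpy(mem, cls.Defaults, cls.Size);  // normal path
        else
            memset(mem, 0, cls.Size);  // the fix: never copy from NULL
        return mem;
    }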


@@ -341,8 +341,8 @@ mvlineasm1:
 beginmvline:
         mov ebx, edx
 maskmach3a: shr ebx, 32
-        mov bl, byte [esi+ebx]
-        cmp bl, 0
+        movzx ebx, byte [esi+ebx]
+        cmp ebx, 0
         je short skipmask1
 maskmach3c: mov bl, byte [ebp+ebx]
         mov [edi], bl
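
My reading of this change (an assumption, not stated in the commit): after
the patched "shr ebx, 32", ebx holds the texture index, which can now be
larger than 255. The old "mov bl" replaced only the low byte, so leftover
high bits of the index would corrupt the following colormap lookup through
ebx; "movzx ebx" replaces the whole register. A C++ analogue:

    #include <cstdint>

    // Old behavior ("mov bl, [esi+ebx]"): stale high bits of a >255
    // index survive into the colormap lookup.
    uint8_t ShadeTexelOld(const uint8_t *source, const uint8_t *colormap,
                          uint32_t idx)
    {
        uint32_t ebx = (idx & ~0xFFu) | source[idx];
        return colormap[ebx];  // wrong address whenever idx > 255
    }

    // New behavior ("movzx ebx, byte [esi+ebx]"): the register is fully
    // replaced, so the lookup index is always the texel value 0..255.
    uint8_t ShadeTexelNew(const uint8_t *source, const uint8_t *colormap,
                          uint32_t idx)
    {
        return colormap[source[idx]];
    }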
@@ -371,12 +371,12 @@ mvlineasm4:
         mov eax, [bufplce+0]
         mov ebx, [bufplce+4]
-        mov [machmv1+2], eax
-        mov [machmv4+2], ebx
+        mov [machmv1+3], eax
+        mov [machmv4+3], ebx
         mov eax, [bufplce+8]
         mov ebx, [bufplce+12]
-        mov [machmv7+2], eax
-        mov [machmv10+2], ebx
+        mov [machmv7+3], eax
+        mov [machmv10+3], ebx
         mov eax, [palookupoffse]
         mov ebx, [palookupoffse+4]
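
These stores patch the 32-bit displacement of self-modifying instructions.
My reading of the +2 to +3 change (an assumption based on standard x86
encoding): "mov r8, [r32+disp32]" puts two bytes (8A /r) before the
displacement, while "movzx r32, byte [r32+disp32]" needs three (0F B6 /r),
so the displacement now sits one byte further into the instruction:

    #include <cstdint>
    #include <cstring>

    // mov   al,  [eax+disp32]       -> 8A 80 dd dd dd dd    (disp at +2)
    // movzx eax, byte [eax+disp32]  -> 0F B6 80 dd dd dd dd (disp at +3)
    // Hypothetical equivalent of "mov [machmv1+3], eax":
    void PatchDisp32(uint8_t *insn, size_t dispOffset, uint32_t value)
    {
        memcpy(insn + dispOffset, &value, sizeof value);
    }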
@@ -389,7 +389,6 @@ mvlineasm4:
         mov eax, [vince]        ;vince
         mov ebx, [vince+4]
-        xor al, al
         xor bl, bl
         mov [machmv3+2], eax
         mov [machmv6+2], ebx
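
An assumption about why only one of the two "xor" instructions could go:
dl doubles as the transparency-mask accumulator in the loop below, so
vince[1]'s low byte must stay zero for the "add edx" not to disturb those
bits; ecx's low byte has no such second job, so zeroing vince[0]'s low
byte only discarded fractional precision. A small demonstration:

    #include <cassert>
    #include <cstdint>

    int main()
    {
        uint32_t edx = 0x12345600 | 0x05;  // top bits: position, dl: mask bits
        uint32_t vince1 = 0xDEADBE00;      // low byte zeroed by "xor bl, bl"
        edx += vince1;                     // advances the texture coordinate
        assert((edx & 0xFF) == 0x05);      // cannot carry into or clobber dl
        return 0;
    }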
@@ -415,37 +414,37 @@ beginmvlineasm4:
         mov eax, ebp
         mov ebx, esi
 machmv16: shr eax, 32
-machmv15: shr ebx, 32
 machmv12: add ebp, 0x88888888 ;vince[3]
+machmv15: shr ebx, 32
 machmv9: add esi, 0x88888888 ;vince[2]
-machmv10: mov al, [eax+0x88888888] ;bufplce[3]
-machmv7: mov bl, [ebx+0x88888888] ;bufplce[2]
-        cmp al, 1
+machmv10: movzx eax, byte [eax+0x88888888] ;bufplce[3]
+machmv7: movzx ebx, byte [ebx+0x88888888] ;bufplce[2]
+        cmp eax, 1
         adc dl, dl
-        cmp bl, 1
+        cmp ebx, 1
         adc dl, dl
 machmv8: mov bl, [ebx+0x88888888] ;palookupoffs[2]
 machmv11: mov bh, [eax+0x88888888] ;palookupoffs[3]
         mov eax, edx
+machmv6: add edx, 0x88888888 ;vince[1]
 machmv14: shr eax, 32
         shl ebx, 16
-machmv4: mov al, [eax+0x88888888] ;bufplce[1]
-        cmp al, 1
+machmv4: movzx eax, byte [eax+0x88888888] ;bufplce[1]
+        cmp eax, 1
         adc dl, dl
-machmv6: add edx, 0x88888888 ;vince[1]
 machmv5: mov bh, [eax+0x88888888] ;palookupoffs[1]
         mov eax, ecx
-machmv13: shr eax, 32
 machmv3: add ecx, 0x88888888 ;vince[0]
-machmv1: mov al, [eax+0x88888888] ;bufplce[0]
-        cmp al, 1
+machmv13: shr eax, 32
+machmv1: movzx eax, byte [eax+0x88888888] ;bufplce[0]
+        cmp eax, 1
         adc dl, dl
 machmv2: mov bl, [eax+0x88888888] ;palookupoffs[0]
-        shl dl, 4
         xor eax, eax
+        shl dl, 4
 fixchain2mb: add edi, 320
         mov al, dl
         add eax, mvcase15
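
The tail of this loop builds a 4-bit transparency mask (one "adc dl, dl"
per column), scales it with "shl dl, 4", and jumps into a table of
unrolled handlers starting at mvcase15 (presumably 16 bytes per case). A
C++ analogue of that dispatch, with a hypothetical name (WriteFourMasked
is illustrative):

    #include <cstdint>

    void WriteFourMasked(uint8_t *dest, const uint8_t shaded[4],
                         const uint8_t texels[4])
    {
        unsigned mask = 0;
        for (int i = 0; i < 4; ++i)
            mask = (mask << 1) | (texels[i] == 0);  // 1 bit per transparent texel

        // The asm instead jumps to one of 16 unrolled cases (mask * 16
        // bytes past the table base), avoiding per-pixel branches.
        for (int i = 0; i < 4; ++i)
            if (!(mask & (8u >> i)))  // bit clear -> texel is opaque
                dest[i] = shaded[i];
    }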