From 0069ca4072fd191666cd28d531961c6992fdac71 Mon Sep 17 00:00:00 2001 From: Randy Heit Date: Thu, 11 May 2006 01:15:15 +0000 Subject: [PATCH] - Fixed mvlineasm1 and mvlineasm4 so that they can be used with textures taller than 256 pixels. There was a very slight performance hit for this, but I was able to tweak mvlineasm4 to make it approximately as fast as before. Interestingly, maskwallscan manages to be nearly as fast as wallscan despite having to check every pixel for transparency. I'm tempted to dump all the old masked rendering code and use (trans)maskwallscan for everything for the sake of simplicity: Only two functions to maintain for each render style, and much less complicated supporting code. Currently, I need five different functions for each rendering style: One traditional column-at-a-time style like Doom did it originally, two for rt_draw4cols, and two for transmaskwallscan. (Right now, I have cheated, and just done the ones that can be used by walls for transmaskwallscan, so the actual number of different functions isn't quite so high.) For small textures, such as font characters and far-away sprites, I'm sure maskwallscan is faster than the current code. For large textures, it's probably still competitive even if it isn't faster. But considering how similar wallscan and maskwallscan perform, the difference is probably pretty minimal, and maskwallscan still might come out ahead due to its simpler overhead. SVN r105 (trunk) --- docs/rh-log.txt | 21 +++++++++++++++++++++ src/a.nas | 37 ++++++++++++++++++------------------- 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/docs/rh-log.txt b/docs/rh-log.txt index db3ce8c45..91edbc390 100644 --- a/docs/rh-log.txt +++ b/docs/rh-log.txt @@ -1,3 +1,24 @@ +May 10, 2006 +- Fixed mvlineasm1 and mvlineasm4 so that they can be used with textures + taller than 256 pixels. 
There was a very slight performance hit for this, + but I was able to tweak mvlineasm4 to make it approximately as fast as + before. Interestingly, maskwallscan manages to be nearly as fast as + wallscan despite having to check every pixel for transparency. I'm + tempted to dump all the old masked rendering code and use + (trans)maskwallscan for everything for the sake of simplicity: Only + two functions to maintain for each render style, and much less + complicated supporting code. Currently, I need five different functions + for each rendering style: One traditional column-at-a-time style like + Doom did it originally, two for rt_draw4cols, and two for transmaskwallscan. + (Right now, I have cheated, and just done the ones that can be used + by walls for transmaskwallscan, so the actual number of different functions + isn't quite so high.) For small textures, such as font characters and + far-away sprites, I'm sure maskwallscan is faster than the current code. + For large textures, it's probably still competitive even if it isn't faster. + But considering how similarly wallscan and maskwallscan perform, the + difference is probably pretty minimal, and maskwallscan still might come + out ahead due to its simpler overhead. + May 10, 2006 (Changes by Graf Zahl) - Fixed: PClass::CreateNew didn't check whether the class had valid defaults and tried to copy data from a NULL pointer. 
diff --git a/src/a.nas b/src/a.nas index acf00164f..d4c4cf68f 100644 --- a/src/a.nas +++ b/src/a.nas @@ -341,8 +341,8 @@ mvlineasm1: beginmvline: mov ebx, edx maskmach3a: shr ebx, 32 - mov bl, byte [esi+ebx] - cmp bl, 0 + movzx ebx, byte [esi+ebx] + cmp ebx, 0 je short skipmask1 maskmach3c: mov bl, byte [ebp+ebx] mov [edi], bl @@ -371,12 +371,12 @@ mvlineasm4: mov eax, [bufplce+0] mov ebx, [bufplce+4] - mov [machmv1+2], eax - mov [machmv4+2], ebx + mov [machmv1+3], eax + mov [machmv4+3], ebx mov eax, [bufplce+8] mov ebx, [bufplce+12] - mov [machmv7+2], eax - mov [machmv10+2], ebx + mov [machmv7+3], eax + mov [machmv10+3], ebx mov eax, [palookupoffse] mov ebx, [palookupoffse+4] @@ -389,7 +389,6 @@ mvlineasm4: mov eax, [vince] ;vince mov ebx, [vince+4] - xor al, al xor bl, bl mov [machmv3+2], eax mov [machmv6+2], ebx @@ -415,37 +414,37 @@ beginmvlineasm4: mov eax, ebp mov ebx, esi machmv16: shr eax, 32 -machmv15: shr ebx, 32 machmv12: add ebp, 0x88888888 ;vince[3] +machmv15: shr ebx, 32 machmv9: add esi, 0x88888888 ;vince[2] -machmv10: mov al, [eax+0x88888888] ;bufplce[3] -machmv7: mov bl, [ebx+0x88888888] ;bufplce[2] - cmp al, 1 +machmv10: movzx eax, byte [eax+0x88888888];bufplce[3] +machmv7: movzx ebx, byte [ebx+0x88888888];bufplce[2] + cmp eax, 1 adc dl, dl - cmp bl, 1 + cmp ebx, 1 adc dl, dl machmv8: mov bl, [ebx+0x88888888] ;palookupoffs[2] machmv11: mov bh, [eax+0x88888888] ;palookupoffs[3] mov eax, edx +machmv6: add edx, 0x88888888 ;vince[1] machmv14: shr eax, 32 shl ebx, 16 -machmv4: mov al, [eax+0x88888888] ;bufplce[1] - cmp al, 1 +machmv4: movzx eax, byte [eax+0x88888888];bufplce[1] + cmp eax, 1 adc dl, dl -machmv6: add edx, 0x88888888 ;vince[1] machmv5: mov bh, [eax+0x88888888] ;palookupoffs[1] mov eax, ecx -machmv13: shr eax, 32 machmv3: add ecx, 0x88888888 ;vince[0] -machmv1: mov al, [eax+0x88888888] ;bufplce[0] - cmp al, 1 +machmv13: shr eax, 32 +machmv1: movzx eax, byte [eax+0x88888888];bufplce[0] + cmp eax, 1 adc dl, dl machmv2: mov bl, 
[eax+0x88888888] ;palookupoffs[0] - shl dl, 4 xor eax, eax + shl dl, 4 fixchain2mb: add edi, 320 mov al, dl add eax, mvcase15