diff --git a/docs/rh-log.txt b/docs/rh-log.txt
index db3ce8c45..91edbc390 100644
--- a/docs/rh-log.txt
+++ b/docs/rh-log.txt
@@ -1,3 +1,24 @@
+May 10, 2006
+- Fixed mvlineasm1 and mvlineasm4 so that they can be used with textures
+  taller than 256 pixels. There was a very slight performance hit for this,
+  but I was able to tweak mvlineasm4 to make it approximately as fast as
+  before. Interestingly, maskwallscan manages to be nearly as fast as
+  wallscan despite having to check every pixel for transparency. I'm
+  tempted to dump all the old masked rendering code and use
+  (trans)maskwallscan for everything for the sake of simplicity: Only
+  two functions to maintain for each render style, and much less
+  complicated supporting code. Currently, I need five different functions
+  for each rendering style: One traditional column-at-a-time style like
+  Doom did it originally, two for rt_draw4cols, and two for transmaskwallscan.
+  (Right now, I have cheated, and just done the ones that can be used
+  by walls for transmaskwallscan, so the actual number of different functions
+  isn't quite so high.) For small textures, such as font characters and
+  far-away sprites, I'm sure maskwallscan is faster than the current code.
+  For large textures, it's probably still competitive even if it isn't faster.
+  But considering how similar wallscan and maskwallscan perform, the
+  difference is probably pretty minimal, and maskwallscan still might come
+  out ahead due to its simpler overhead.
+
 May 10, 2006 (Changes by Graf Zahl)
 - Fixed: PClass::CreateNew didn't check whether the class had valid defaults
   and tried to copy data from a NULL pointer.
diff --git a/src/a.nas b/src/a.nas
index acf00164f..d4c4cf68f 100644
--- a/src/a.nas
+++ b/src/a.nas
@@ -341,8 +341,8 @@ mvlineasm1:
 beginmvline:
     mov ebx, edx
 maskmach3a: shr ebx, 32
-    mov bl, byte [esi+ebx]
-    cmp bl, 0
+    movzx ebx, byte [esi+ebx]
+    cmp ebx, 0
     je short skipmask1
 maskmach3c: mov bl, byte [ebp+ebx]
     mov [edi], bl
@@ -371,12 +371,12 @@ mvlineasm4:
 
     mov eax, [bufplce+0]
     mov ebx, [bufplce+4]
-    mov [machmv1+2], eax
-    mov [machmv4+2], ebx
+    mov [machmv1+3], eax
+    mov [machmv4+3], ebx
     mov eax, [bufplce+8]
     mov ebx, [bufplce+12]
-    mov [machmv7+2], eax
-    mov [machmv10+2], ebx
+    mov [machmv7+3], eax
+    mov [machmv10+3], ebx
 
     mov eax, [palookupoffse]
     mov ebx, [palookupoffse+4]
@@ -389,7 +389,6 @@ mvlineasm4:
 
     mov eax, [vince]  ;vince
     mov ebx, [vince+4]
-    xor al, al
     xor bl, bl
     mov [machmv3+2], eax
     mov [machmv6+2], ebx
@@ -415,37 +414,37 @@ beginmvlineasm4:
     mov eax, ebp
     mov ebx, esi
 machmv16: shr eax, 32
-machmv15: shr ebx, 32
 machmv12: add ebp, 0x88888888  ;vince[3]
+machmv15: shr ebx, 32
 machmv9: add esi, 0x88888888  ;vince[2]
-machmv10: mov al, [eax+0x88888888]  ;bufplce[3]
-machmv7: mov bl, [ebx+0x88888888]  ;bufplce[2]
-    cmp al, 1
+machmv10: movzx eax, byte [eax+0x88888888]  ;bufplce[3]
+machmv7: movzx ebx, byte [ebx+0x88888888]  ;bufplce[2]
+    cmp eax, 1
     adc dl, dl
-    cmp bl, 1
+    cmp ebx, 1
     adc dl, dl
 machmv8: mov bl, [ebx+0x88888888]  ;palookupoffs[2]
 machmv11: mov bh, [eax+0x88888888]  ;palookupoffs[3]
 
     mov eax, edx
+machmv6: add edx, 0x88888888  ;vince[1]
 machmv14: shr eax, 32
     shl ebx, 16
-machmv4: mov al, [eax+0x88888888]  ;bufplce[1]
-    cmp al, 1
+machmv4: movzx eax, byte [eax+0x88888888]  ;bufplce[1]
+    cmp eax, 1
     adc dl, dl
-machmv6: add edx, 0x88888888  ;vince[1]
 machmv5: mov bh, [eax+0x88888888]  ;palookupoffs[1]
 
     mov eax, ecx
-machmv13: shr eax, 32
 machmv3: add ecx, 0x88888888  ;vince[0]
-machmv1: mov al, [eax+0x88888888]  ;bufplce[0]
-    cmp al, 1
+machmv13: shr eax, 32
+machmv1: movzx eax, byte [eax+0x88888888]  ;bufplce[0]
+    cmp eax, 1
     adc dl, dl
 machmv2: mov bl, [eax+0x88888888]  ;palookupoffs[0]
 
-    shl dl, 4
     xor eax, eax
+    shl dl, 4
 fixchain2mb: add edi, 320
     mov al, dl
     add eax, mvcase15
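
Note: as an illustration of the per-pixel transparency test described in the
rh-log.txt entry above, the inner loop of mvlineasm1 (and of the column drawers
that maskwallscan relies on) boils down to roughly the following C sketch. The
function name and parameters are hypothetical, not ZDoom's actual declarations;
color index 0 is assumed to mean "transparent", and vshift stands for the
patched shift count (32 minus the texture's height in bits).

    #include <stdint.h>

    /* Draw one masked column: fetch each texel, skip it if it is 0
     * (transparent), otherwise shade it through the colormap and write it.
     * In the assembly, the texel used to be loaded into the low byte of the
     * register that still held the texture row index, which only worked when
     * that index fit in 8 bits (textures up to 256 pixels tall); the movzx
     * change zero-extends the texel instead, which is what this C code does
     * implicitly. */
    static void draw_masked_column(const uint8_t *source,   /* column texels */
                                   const uint8_t *colormap, /* shading table */
                                   uint8_t *dest, int count,
                                   int pitch,                /* bytes per screen row */
                                   uint32_t frac, uint32_t fracstep,
                                   int vshift)
    {
        while (count-- > 0)
        {
            uint8_t texel = source[frac >> vshift]; /* texture row from fixed-point frac */
            if (texel != 0)                         /* 0 = transparent, leave dest alone */
                *dest = colormap[texel];
            dest += pitch;
            frac += fracstep;
        }
    }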
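Note: the [machmvN+2] -> [machmvN+3] changes in the setup code follow from the
instruction substitution. mvlineasm4 is self-modifying: at setup it overwrites
the 32-bit displacement (the 0x88888888 placeholder) embedded in each machmvN
instruction. "mov al, [eax+disp32]" encodes as 8A 80 <disp32>, so its
displacement starts 2 bytes into the instruction, while "movzx eax, byte
[eax+disp32]" encodes as 0F B6 80 <disp32>, putting the displacement at byte 3.
A hypothetical C fragment of that kind of fixup, assuming writable code and a
little-endian target as the real routine does:

    #include <stdint.h>
    #include <string.h>

    /* Overwrite the disp32 field of a patched instruction.  'insn' points at
     * the instruction's first byte (e.g. the machmv1 label) and 'disp_offset'
     * is 2 for the old mov form or 3 for the new movzx form. */
    static void patch_disp32(uint8_t *insn, int disp_offset, uint32_t value)
    {
        memcpy(insn + disp_offset, &value, sizeof value);
    }

The reshuffling of the surrounding machmv* instructions within the loop reads
like instruction scheduling to absorb the extra opcode byte, which matches the
log's remark about tweaking mvlineasm4 back to roughly its old speed.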