quakeforge/libs/video/renderer/sw/d_parta.S
Bill Currie 0c437492b4 [renderer] Move to using dynamic frame buffers
For now, OpenGL and Vulkan renderers are broken as I focused on getting
the software renderer working (which was quite tricky to get right).

This fixes a couple of issues: the segfault when warping the screen (due
to the scene rendering move invalidating the warp buffer), and warp
always having 320x200 resolution. There's still the problem of the
effect being too subtle at high resolution, but that's just a matter of
updating the tables and tweaking the code in D_WarpScreen.

Another issue is the Draw functions should probably write directly to
the main frame buffer or even one passed in as a parameter. This would
remove the need for binding the main buffer at the beginning and end of
the frame.
2022-03-24 12:56:29 +09:00

491 lines
10 KiB
ArmAsm

/*
d_parta.S
x86 assembly-language 8-bpp particle-drawing code.
Copyright (C) 1996-1997 Id Software, Inc.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to:
Free Software Foundation, Inc.
59 Temple Place - Suite 330
Boston, MA 02111-1307, USA
$Id$
*/
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include "asm_i386.h"
#include "quakeasm.h"
#include "d_ifacea.h"
#include "asm_draw.h"
#ifdef PIC
#undef USE_INTEL_ASM //XXX asm pic hack
#endif
#ifdef USE_INTEL_ASM
//----------------------------------------------------------------------
// 8-bpp particle drawing code.
//----------------------------------------------------------------------
//FIXME: comments, full optimization
//----------------------------------------------------------------------
// 8-bpp particle queueing code.
//----------------------------------------------------------------------
.text
#define P 12+4
.align 4
.globl C(D_DrawParticle)
C(D_DrawParticle):
pushl %ebp // preserve caller's stack frame
pushl %edi // preserve register variables
pushl %ebx
movl P(%esp),%edi
// FIXME: better FP overlap in general here
// transform point
// VectorSubtract (p->org, r_porigin, local);
flds C(r_porigin)
fsubrs pt_pos(%edi)
flds pt_pos+4(%edi)
fsubs C(r_porigin)+4
flds pt_pos+8(%edi)
fsubs C(r_porigin)+8
fxch %st(2) // local[0] | local[1] | local[2]
// transformed[2] = DotProduct(local, r_ppn);
flds C(r_ppn) // r_ppn[0] | local[0] | local[1] | local[2]
fmul %st(1),%st(0) // dot0 | local[0] | local[1] | local[2]
flds C(r_ppn)+4 // r_ppn[1] | dot0 | local[0] | local[1] | local[2]
fmul %st(3),%st(0) // dot1 | dot0 | local[0] | local[1] | local[2]
flds C(r_ppn)+8 // r_ppn[2] | dot1 | dot0 | local[0] |
// local[1] | local[2]
fmul %st(5),%st(0) // dot2 | dot1 | dot0 | local[0] | local[1] | local[2]
fxch %st(2) // dot0 | dot1 | dot2 | local[0] | local[1] | local[2]
faddp %st(0),%st(1) // dot0 + dot1 | dot2 | local[0] | local[1] |
// local[2]
faddp %st(0),%st(1) // z | local[0] | local[1] | local[2]
fld %st(0) // z | z | local[0] | local[1] |
// local[2]
fdivrs C(float_1) // 1/z | z | local[0] | local[1] | local[2]
fxch %st(1) // z | 1/z | local[0] | local[1] | local[2]
// if (transformed[2] < PARTICLE_Z_CLIP)
// return;
fcomps C(float_particle_z_clip) // 1/z | local[0] | local[1] | local[2]
fxch %st(3) // local[2] | local[0] | local[1] | 1/z
flds C(r_pup) // r_pup[0] | local[2] | local[0] | local[1] | 1/z
fmul %st(2),%st(0) // dot0 | local[2] | local[0] | local[1] | 1/z
flds C(r_pup)+4 // r_pup[1] | dot0 | local[2] | local[0] |
// local[1] | 1/z
fnstsw %ax
testb $1,%ah
jnz LPop6AndDone
// transformed[1] = DotProduct(local, r_pup);
fmul %st(4),%st(0) // dot1 | dot0 | local[2] | local[0] | local[1] | 1/z
flds C(r_pup)+8 // r_pup[2] | dot1 | dot0 | local[2] |
// local[0] | local[1] | 1/z
fmul %st(3),%st(0) // dot2 | dot1 | dot0 | local[2] | local[0] |
// local[1] | 1/z
fxch %st(2) // dot0 | dot1 | dot2 | local[2] | local[0] |
// local[1] | 1/z
faddp %st(0),%st(1) // dot0 + dot1 | dot2 | local[2] | local[0] |
// local[1] | 1/z
faddp %st(0),%st(1) // y | local[2] | local[0] | local[1] | 1/z
fxch %st(3) // local[1] | local[2] | local[0] | y | 1/z
// transformed[0] = DotProduct(local, r_pright);
fmuls C(r_pright)+4 // dot1 | local[2] | local[0] | y | 1/z
fxch %st(2) // local[0] | local[2] | dot1 | y | 1/z
fmuls C(r_pright) // dot0 | local[2] | dot1 | y | 1/z
fxch %st(1) // local[2] | dot0 | dot1 | y | 1/z
fmuls C(r_pright)+8 // dot2 | dot0 | dot1 | y | 1/z
fxch %st(2) // dot1 | dot0 | dot2 | y | 1/z
faddp %st(0),%st(1) // dot1 + dot0 | dot2 | y | 1/z
faddp %st(0),%st(1) // x | y | 1/z
fxch %st(1) // y | x | 1/z
// project the point
fmul %st(2),%st(0) // y/z | x | 1/z
fxch %st(1) // x | y/z | 1/z
fmul %st(2),%st(0) // x/z | y/z | 1/z
fxch %st(1) // y/z | x/z | 1/z
fsubrs C(ycenter) // v | x/z | 1/z
fxch %st(1) // x/z | v | 1/z
fadds C(xcenter) // u | v | 1/z
// FIXME: preadjust xcenter and ycenter
fxch %st(1) // v | u | 1/z
fadds C(float_point5) // v | u | 1/z
fxch %st(1) // u | v | 1/z
fadds C(float_point5) // u | v | 1/z
fxch %st(2) // 1/z | v | u
fmuls C(DP_32768) // 1/z * 0x8000 | v | u
fxch %st(2) // u | v | 1/z * 0x8000
// FIXME: use Terje's fp->int trick here?
// FIXME: check we're getting proper rounding here
fistpl C(DP_u) // v | 1/z * 0x8000
fistpl C(DP_v) // 1/z * 0x8000
movl C(DP_u),%eax
movl C(DP_v),%edx
// if ((v > d_vrectbottom_particle) ||
// (u > d_vrectright_particle) ||
// (v < d_vrecty) ||
// (u < d_vrectx))
// {
// continue;
// }
movl C(d_vrectbottom_particle),%ebx
movl C(d_vrectright_particle),%ecx
cmpl %ebx,%edx
jg LPop1AndDone
cmpl %ecx,%eax
jg LPop1AndDone
movl C(d_vrecty),%ebx
movl C(d_vrectx),%ecx
cmpl %ebx,%edx
jl LPop1AndDone
cmpl %ecx,%eax
jl LPop1AndDone
flds pt_color(%edi) // color | 1/z * 0x8000
fstps C(DP_Color) // 1/z * 0x8000
movl C(d_viewbuffer),%ebx
addl %eax,%ebx
movl C(d_scantable)(,%edx,4),%edi // point to the pixel
imull C(d_zrowbytes),%edx // point to the z pixel
leal (%edx,%eax,2),%edx
movl C(d_zbuffer),%eax
fistpl C(izi)
addl %ebx,%edi
addl %eax,%edx
// pix = izi >> d_pix_shift;
movl C(izi),%eax
movl C(d_pix_shift),%ecx
shrl %cl,%eax
movl C(izi),%ebp
// if (pix < d_pix_min)
// pix = d_pix_min;
// else if (pix > d_pix_max)
// pix = d_pix_max;
movl C(d_pix_min),%ebx
movl C(d_pix_max),%ecx
cmpl %ebx,%eax
jnl LTestPixMax
movl %ebx,%eax
jmp LTestDone
LTestPixMax:
cmpl %ecx,%eax
jng LTestDone
movl %ecx,%eax
LTestDone:
movb C(DP_Color),%ch
movl C(d_y_aspect_shift),%ebx
testl %ebx,%ebx
jnz LDefault
cmpl $4,%eax
ja LDefault
jmp *C(DP_EntryTable)-4(,%eax,4)
// 1x1
.globl C(DP_1x1)
C(DP_1x1):
cmpw %bp,(%edx) // just one pixel to do
jg LDone
movw %bp,(%edx)
movb %ch,(%edi)
jmp LDone
// 2x2
.globl C(DP_2x2)
C(DP_2x2):
pushl %esi
movl C(d_rowbytes),%ebx
movl C(d_zrowbytes),%esi
cmpw %bp,(%edx)
jg L2x2_1
movw %bp,(%edx)
movb %ch,(%edi)
L2x2_1:
cmpw %bp,2(%edx)
jg L2x2_2
movw %bp,2(%edx)
movb %ch,1(%edi)
L2x2_2:
cmpw %bp,(%edx,%esi,1)
jg L2x2_3
movw %bp,(%edx,%esi,1)
movb %ch,(%edi,%ebx,1)
L2x2_3:
cmpw %bp,2(%edx,%esi,1)
jg L2x2_4
movw %bp,2(%edx,%esi,1)
movb %ch,1(%edi,%ebx,1)
L2x2_4:
popl %esi
jmp LDone
// 3x3
.globl C(DP_3x3)
C(DP_3x3):
pushl %esi
movl C(d_rowbytes),%ebx
movl C(d_zrowbytes),%esi
cmpw %bp,(%edx)
jg L3x3_1
movw %bp,(%edx)
movb %ch,(%edi)
L3x3_1:
cmpw %bp,2(%edx)
jg L3x3_2
movw %bp,2(%edx)
movb %ch,1(%edi)
L3x3_2:
cmpw %bp,4(%edx)
jg L3x3_3
movw %bp,4(%edx)
movb %ch,2(%edi)
L3x3_3:
cmpw %bp,(%edx,%esi,1)
jg L3x3_4
movw %bp,(%edx,%esi,1)
movb %ch,(%edi,%ebx,1)
L3x3_4:
cmpw %bp,2(%edx,%esi,1)
jg L3x3_5
movw %bp,2(%edx,%esi,1)
movb %ch,1(%edi,%ebx,1)
L3x3_5:
cmpw %bp,4(%edx,%esi,1)
jg L3x3_6
movw %bp,4(%edx,%esi,1)
movb %ch,2(%edi,%ebx,1)
L3x3_6:
cmpw %bp,(%edx,%esi,2)
jg L3x3_7
movw %bp,(%edx,%esi,2)
movb %ch,(%edi,%ebx,2)
L3x3_7:
cmpw %bp,2(%edx,%esi,2)
jg L3x3_8
movw %bp,2(%edx,%esi,2)
movb %ch,1(%edi,%ebx,2)
L3x3_8:
cmpw %bp,4(%edx,%esi,2)
jg L3x3_9
movw %bp,4(%edx,%esi,2)
movb %ch,2(%edi,%ebx,2)
L3x3_9:
popl %esi
jmp LDone
// 4x4
.globl C(DP_4x4)
C(DP_4x4):
pushl %esi
movl C(d_rowbytes),%ebx
movl C(d_zrowbytes),%esi
cmpw %bp,(%edx)
jg L4x4_1
movw %bp,(%edx)
movb %ch,(%edi)
L4x4_1:
cmpw %bp,2(%edx)
jg L4x4_2
movw %bp,2(%edx)
movb %ch,1(%edi)
L4x4_2:
cmpw %bp,4(%edx)
jg L4x4_3
movw %bp,4(%edx)
movb %ch,2(%edi)
L4x4_3:
cmpw %bp,6(%edx)
jg L4x4_4
movw %bp,6(%edx)
movb %ch,3(%edi)
L4x4_4:
cmpw %bp,(%edx,%esi,1)
jg L4x4_5
movw %bp,(%edx,%esi,1)
movb %ch,(%edi,%ebx,1)
L4x4_5:
cmpw %bp,2(%edx,%esi,1)
jg L4x4_6
movw %bp,2(%edx,%esi,1)
movb %ch,1(%edi,%ebx,1)
L4x4_6:
cmpw %bp,4(%edx,%esi,1)
jg L4x4_7
movw %bp,4(%edx,%esi,1)
movb %ch,2(%edi,%ebx,1)
L4x4_7:
cmpw %bp,6(%edx,%esi,1)
jg L4x4_8
movw %bp,6(%edx,%esi,1)
movb %ch,3(%edi,%ebx,1)
L4x4_8:
leal (%edx,%esi,2),%edx
leal (%edi,%ebx,2),%edi
cmpw %bp,(%edx)
jg L4x4_9
movw %bp,(%edx)
movb %ch,(%edi)
L4x4_9:
cmpw %bp,2(%edx)
jg L4x4_10
movw %bp,2(%edx)
movb %ch,1(%edi)
L4x4_10:
cmpw %bp,4(%edx)
jg L4x4_11
movw %bp,4(%edx)
movb %ch,2(%edi)
L4x4_11:
cmpw %bp,6(%edx)
jg L4x4_12
movw %bp,6(%edx)
movb %ch,3(%edi)
L4x4_12:
cmpw %bp,(%edx,%esi,1)
jg L4x4_13
movw %bp,(%edx,%esi,1)
movb %ch,(%edi,%ebx,1)
L4x4_13:
cmpw %bp,2(%edx,%esi,1)
jg L4x4_14
movw %bp,2(%edx,%esi,1)
movb %ch,1(%edi,%ebx,1)
L4x4_14:
cmpw %bp,4(%edx,%esi,1)
jg L4x4_15
movw %bp,4(%edx,%esi,1)
movb %ch,2(%edi,%ebx,1)
L4x4_15:
cmpw %bp,6(%edx,%esi,1)
jg L4x4_16
movw %bp,6(%edx,%esi,1)
movb %ch,3(%edi,%ebx,1)
L4x4_16:
popl %esi
jmp LDone
// default case, handling any size particle
LDefault:
// count = pix << d_y_aspect_shift;
movl %eax,%ebx
movl %eax,C(DP_Pix)
movb C(d_y_aspect_shift),%cl
shll %cl,%ebx
// for ( ; count ; count--, pz += d_zwidth, pdest += d_rowbytes)
// {
// for (i=0 ; i<pix ; i++)
// {
// if (pz[i] <= izi)
// {
// pz[i] = izi;
// pdest[i] = color;
// }
// }
// }
LGenRowLoop:
movl C(DP_Pix),%eax
LGenColLoop:
cmpw %bp,-2(%edx,%eax,2)
jg LGSkip
movw %bp,-2(%edx,%eax,2)
movb %ch,-1(%edi,%eax,1)
LGSkip:
decl %eax // --pix
jnz LGenColLoop
addl C(d_zrowbytes),%edx
addl C(d_rowbytes),%edi
decl %ebx // --count
jnz LGenRowLoop
LDone:
popl %ebx // restore register variables
popl %edi
popl %ebp // restore the caller's stack frame
ret
LPop6AndDone:
fstp %st(0)
fstp %st(0)
fstp %st(0)
fstp %st(0)
fstp %st(0)
LPop1AndDone:
fstp %st(0)
jmp LDone
#endif // USE_INTEL_ASM
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif