Blitter optimizations and rounding fixes.

git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/back/trunk@18360 72102866-910b-0410-8b05-ffd578937521
Alexander Malmberg 2004-01-10 15:37:13 +00:00
parent 27190e1de4
commit 2291618e83
3 changed files with 418 additions and 32 deletions


@@ -1,3 +1,16 @@
2004-01-10 16:25 Alexander Malmberg <alexander@malmberg.org>
* Source/art/blit.m: Replace uses of DI_16_B5G5R5A1 and DI_16_B5G6R5
with uses of DI_16_B5_G5_R5_A1 and DI_16_B5_G6_R5.
(sover_ao): Add an optimized version for the 16/15 bpp modes.
(satop_aa): Simplify calculation of da'.
(datop_aa): Simplify calculation of da'. Fix the rounding.
(xor_aa): Fix the rounding.
(DI_16_B5_G5_R5_A1, DI_16_B5_G6_R5): Unpack pixels in a more
efficient way.
* Source/art/blit_scrapheap.m: New file.
2004-01-07 14:51 Alexander Malmberg <alexander@malmberg.org>
* Source/art/composite.m (-_composite_func::::::): If the source has


@@ -1,5 +1,5 @@
/*
Copyright (C) 2002 Free Software Foundation, Inc.
Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
Author: Alexander Malmberg <alexander@malmberg.org>
@@ -47,6 +47,13 @@ still need to fix all the remaining spots
rounding is way off (but not more than 1, hopefully) for near-boundary
cases. at least pure black stays pure black and pure white stays pure white
2004-01-10: Test suite now says that all composite functions are 'correct',
in the sense that the difference from the correct result is never larger
than 0.5, and that's the best I can get with finite precision. :)
(However, should probably check whether the results here always match the
correctly rounded correct result.)
TODO: (optional?) proper gamma handling?
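[Editor's note: a quick way to see why the +0xff bias is used, and why pure black and pure white survive: with +0xff, scaling an 8-bit value by full coverage is an exact identity, while the older +0x80 bias loses one for values above 128. A minimal standalone check, not part of blit.m:]

#include <stdio.h>

/* Compare the two rounding biases for x*255 scaled back to 8 bits,
   i.e. blending at full coverage.  With +0xff the result is always x;
   with +0x80 it comes out one low for x > 128. */
int main(void)
{
  int x, worst_ff = 0, worst_80 = 0;

  for (x = 0; x <= 255; x++)
    {
      int r_ff = (x * 255 + 0xff) >> 8;
      int r_80 = (x * 255 + 0x80) >> 8;

      if (x - r_ff > worst_ff) worst_ff = x - r_ff;
      if (x - r_80 > worst_80) worst_80 = x - r_80;
    }
  printf("max error with +0xff: %d, with +0x80: %d\n", worst_ff, worst_80);
  return 0;
}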
@@ -275,7 +282,7 @@ static void MPRE(blit_mono_a) (unsigned char *adst, unsigned char *dsta,
nb = (b * a + nb * (255 - a) + 0xff) >> 8;
BLEND_WRITE(dst, nr, ng, nb)
BLEND_INC(dst)
}
}
else
{
BLEND_INC(dst)
@@ -336,7 +343,7 @@ static void MPRE(blit_subpixel) (unsigned char *adst, const unsigned char *asrc,
static void MPRE(run_opaque) (render_run_t *ri, int num)
{
#if FORMAT_HOW == DI_16_B5G5R5A1 || FORMAT_HOW == DI_16_B5G6R5
#if FORMAT_HOW == DI_16_B5_G5_R5_A1 || FORMAT_HOW == DI_16_B5_G6_R5
unsigned int v;
unsigned short *dst = (unsigned short *)ri->dst;
@@ -495,6 +502,141 @@ static void MPRE(sover_aa) (composite_run_t *c, int num)
static void MPRE(sover_ao) (composite_run_t *c, int num)
{
#if FORMAT_HOW == DI_16_B5_G6_R5 || FORMAT_HOW == DI_16_B5_G5_R5_A1
BLEND_TYPE *s = (BLEND_TYPE *)c->src, *d = (BLEND_TYPE *)c->dst;
unsigned char *src_alpha = c->srca;
int sr, sg, sb, sa;
#undef i386
#ifndef i386
int dr, dg, db;
#endif
unsigned int temp;
for (; num; num--)
{
ALPHA_READ(s, src_alpha, sa)
if (!sa)
{
ALPHA_INC(s, src_alpha)
BLEND_INC(d)
continue;
}
if (sa == 255)
{
BLEND_READ(s, sr, sg, sb)
BLEND_WRITE(d, sr, sg, sb)
ALPHA_INC(s, src_alpha)
BLEND_INC(d)
continue;
}
#ifdef i386
/*
The basic idea here is to scale all components using one multiply,
and to do so without losing any accuracy. To do this, we move the
components around so we get 8 empty bits above each component. In
a 32-bit word, we don't have enough space for this, but by moving
green to the top 6 bits and using ix86 'mul', we'll get the top 32
bits of the multiplication in another register and can later recombine
the components relatively easily.
Mostly equivalent C (16bpp case):
unsigned long long int temp;
sa = 255-sa;
temp = d[0];
temp = ((temp|(temp<<21))&0xfc00001f)|((temp&0xf800)<<2);
temp = temp*sa;
temp = temp+0x000000020003e01fLL;
temp = temp>>8;
temp = (temp&0x1f) | ((temp&0x3e000)>>2) | ((temp&0xfc000000)>>21);
d[0]=temp + s[0];
16bpp:
original: 0000 0000 0000 0000 bbbb bggg gggr rrrr
after unpacking: gggg gg00 0000 00bb bbb0 0000 000r rrrr
after 'mul': .... 0000 0000 gggg gggg gggg ggbb bbbb bbbb bbbr rrrr rrrr rrrr
15bpp:
original: 0000 0000 0000 0000 0bbb bbgg gggr rrrr
after unpacking: gggg g000 0000 00bb bbb0 0000 000r rrrr
after 'mul': .... 0000 0000 gggg gggg gggg g0bb bbbb bbbb bbbr rrrr rrrr rrrr
*/
temp = d[0];
sa = 255 - sa;
asm (
"movl %0,%%eax\n"
#if FORMAT_HOW == DI_16_B5_G6_R5
"shll $21,%%eax\n"
#else
"shll $22,%%eax\n"
#endif
"orl %0,%%eax\n"
#if FORMAT_HOW == DI_16_B5_G6_R5
"andl $0xfc00001f,%%eax\n"
"andl $0xf800,%0\n"
"shll $2,%0\n"
#else
"andl $0xf800001f,%%eax\n"
"andl $0x7c00,%0\n"
"shll $3,%0\n"
#endif
"orl %0,%%eax\n"
"mul %2\n"
"addl $0x0003e01f,%%eax\n"
#if FORMAT_HOW == DI_16_B5_G6_R5
"addl $0x02,%%edx\n"
"andl $0xfc,%%edx\n"
"shll $3,%%edx\n"
#else
"addl $0x01,%%edx\n"
"andl $0xf8,%%edx\n"
"shll $2,%%edx\n"
#endif
"shrl $8,%%eax\n"
"movl %%eax,%0\n"
"andl $0x1f,%0\n"
"orl %%edx,%0\n"
#if FORMAT_HOW == DI_16_B5_G6_R5
"shrl $2,%%eax\n"
"andl $0xf800,%%eax\n"
#else
"shrl $3,%%eax\n"
"andl $0x7c00,%%eax\n"
#endif
"orl %%eax,%0\n"
: "=r" (temp)
: "0" (temp), "g" (sa)
: "eax", "edx");
d[0] = temp + s[0];
#else
/*
Generic, non-ix86 code. Can't use the really optimized path, but
we can still add in the entire source pixel instead of unpacking
it.
*/
BLEND_READ(d, dr, dg, db)
sa = 255 - sa;
dr = ((dr * sa + 0xff) >> 8);
dg = ((dg * sa + 0xff) >> 8);
db = ((db * sa + 0xff) >> 8);
COPY_ASSEMBLE_PIXEL(temp, dr, dg, db)
d[0] = temp + s[0];
#endif
ALPHA_INC(s, src_alpha)
BLEND_INC(d)
}
#else
BLEND_TYPE *s = (BLEND_TYPE *)c->src, *d = (BLEND_TYPE *)c->dst;
#ifndef INLINE_ALPHA
unsigned char *src_alpha = c->srca;
@@ -532,6 +674,7 @@ static void MPRE(sover_ao) (composite_run_t *c, int num)
ALPHA_INC(s, src_alpha)
BLEND_INC(d)
}
#endif
}
/* dsta : 0 */
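[Editor's note: for reference, the single-multiply trick described in the comment above can be written in portable C and checked against a per-field version. A rough standalone sketch of the 16bpp case only; the helper names and the per-field rounding constants (+0x1f, +0x1f, +0x80) are my reading of the packed constant, not taken from blit.m.]

#include <stdio.h>
#include <stdint.h>

/* Scale all three fields of a bbbbbggggggrrrrr pixel by sa with one
   64-bit multiply, following the "mostly equivalent C" comment in
   sover_ao above. */
static uint16_t scale_565_packed(uint16_t d, unsigned sa)
{
  uint64_t t = d;

  t = ((t | (t << 21)) & 0xfc00001fULL) | ((t & 0xf800) << 2);
  t = t * sa + 0x000000020003e01fULL;
  t >>= 8;
  return (uint16_t)((t & 0x1f) | ((t & 0x3e000) >> 2) | ((t & 0xfc000000) >> 21));
}

/* The same scaling done one field at a time. */
static uint16_t scale_565_fields(uint16_t d, unsigned sa)
{
  unsigned r = d & 0x1f, g = (d >> 5) & 0x3f, b = (d >> 11) & 0x1f;

  r = (r * sa + 0x1f) >> 8;
  g = (g * sa + 0x80) >> 8;
  b = (b * sa + 0x1f) >> 8;
  return (uint16_t)(r | (g << 5) | (b << 11));
}

int main(void)
{
  unsigned d, sa;

  for (d = 0; d < 0x10000; d++)
    for (sa = 0; sa < 256; sa++)
      if (scale_565_packed((uint16_t)d, sa) != scale_565_fields((uint16_t)d, sa))
        printf("mismatch at d=%04x sa=%02x\n", d, sa);
  return 0;
}

[The 15bpp (B5_G5_R5_A1) variant differs only in the shifts and masks, as the #if branches in the asm show.]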
@@ -749,7 +892,7 @@ static void MPRE(satop_aa) (composite_run_t *c, int num)
dr = (sr * da + dr * sa + 0xff) >> 8;
dg = (sg * da + dg * sa + 0xff) >> 8;
db = (sb * da + db * sa + 0xff) >> 8;
da = ((255 - sa) * da + da * sa + 0xff) >> 8;
/* For alpha, satop simplifies to da' = da. */
BLEND_WRITE_ALPHA(d, dst_alpha, dr, dg, db, da)
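[Editor's note: the simplification is the usual atop identity, (1 - sa)*da + sa*da = da, and it still holds exactly in the integer form that was removed here. A tiny standalone check, not part of the source:]

#include <stdio.h>

/* Verify that the old satop alpha expression really is just da. */
int main(void)
{
  int sa, da;

  for (sa = 0; sa <= 255; sa++)
    for (da = 0; da <= 255; da++)
      if ((((255 - sa) * da + da * sa + 0xff) >> 8) != da)
        printf("differs at sa=%d da=%d\n", sa, da);
  return 0;
}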
@@ -977,10 +1120,11 @@ static void MPRE(datop_aa) (composite_run_t *c, int num)
da = 255 - da;
dr = (dr * sa + sr * da + 0x80) >> 8;
dg = (dg * sa + sg * da + 0x80) >> 8;
db = (db * sa + sb * da + 0x80) >> 8;
da = ((255 - da) * sa + sa * da + 0x80) >> 8;
dr = (dr * sa + sr * da + 0xff) >> 8;
dg = (dg * sa + sg * da + 0xff) >> 8;
db = (db * sa + sb * da + 0xff) >> 8;
/* For alpha, datop simplifies to da' = sa. */
da = sa;
BLEND_WRITE_ALPHA(d, dst_alpha, dr, dg, db, da)
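[Editor's note: likewise for datop, (1 - da)*sa + da*sa is just sa, so da' = sa. With the old +0x80 bias the integer expression came out one low whenever sa > 128, which is why this hunk is both a simplification and a rounding fix. Standalone check:]

#include <stdio.h>

/* The datop alpha term (255 - da)*sa + sa*da equals 255*sa, so with the
   +0xff bias it reduces exactly to sa; the old +0x80 version did not. */
int main(void)
{
  int sa, da, old_wrong = 0;

  for (sa = 0; sa <= 255; sa++)
    for (da = 0; da <= 255; da++)
      {
        int fixed = ((255 - da) * sa + sa * da + 0xff) >> 8;
        int old   = ((255 - da) * sa + sa * da + 0x80) >> 8;

        if (fixed != sa)
          printf("unexpected: sa=%d da=%d\n", sa, da);
        if (old != sa)
          old_wrong++;
      }
  printf("old expression off by one in %d of 65536 cases\n", old_wrong);
  return 0;
}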
@@ -1040,10 +1184,10 @@ static void MPRE(xor_aa) (composite_run_t *c, int num)
da = 255 - da;
sa = 255 - sa;
dr = ((dr * sa + sr * da + 0x80) >> 8);
dg = ((dg * sa + sg * da + 0x80) >> 8);
db = ((db * sa + sb * da + 0x80) >> 8);
da = ((da * (255 - sa) + sa * (255 - da) + 0x80) >> 8);
dr = ((dr * sa + sr * da + 0xff) >> 8);
dg = ((dg * sa + sg * da + 0xff) >> 8);
db = ((db * sa + sb * da + 0xff) >> 8);
da = ((da * (255 - sa) + sa * (255 - da) + 0xff) >> 8);
BLEND_WRITE_ALPHA(d, dst_alpha, dr, dg, db, da)
@@ -1532,15 +1676,15 @@ ourself.
/* 16-bit 5 bits blue, 6 bits green, 5 bits red */
#define FORMAT_INSTANCE b5g6r5
#define FORMAT_HOW DI_16_B5G6R5
#define FORMAT_HOW DI_16_B5_G6_R5
#warning B5G6R5
#define BLEND_TYPE unsigned short
#define BLEND_READ(p,nr,ng,nb) \
{ \
unsigned short _s=p[0]; \
nr=(_s>>11)<<3; \
ng=((_s>>5)<<2)&0xff; \
nr=(_s>>8); \
ng=(_s>>3)&0xff; \
nb=(_s<<3)&0xff; \
}
#define BLEND_READ_ALPHA(p,pa,nr,ng,nb,na) \
@@ -1570,15 +1714,15 @@ ourself.
/* 16-bit 5 bits blue, 5 bits green, 5 bits red */
#define FORMAT_INSTANCE b5g5r5a1
#define FORMAT_HOW DI_16_B5G5R5A1
#define FORMAT_HOW DI_16_B5_G5_R5_A1
#warning B5G5R5A1
#define BLEND_TYPE unsigned short
#define BLEND_READ(p,nr,ng,nb) \
{ \
unsigned short _s=p[0]; \
nr=(_s>>10)<<3; \
ng=((_s>>5)<<3)&0xff; \
nr=(_s>>7); \
ng=(_s>>2)&0xff; \
nb=(_s<<3)&0xff; \
}
#define BLEND_READ_ALPHA(p,pa,nr,ng,nb,na) \
@@ -1834,20 +1978,7 @@ Xor 1 - dstA 1 - srcA noop noop
PlusL dst=src+dst , clamp to 1.0; dsta=srca+dsta, clamp to 1.0
PlisD dst=src+dst-1, clamp to 0.0; dsta=srca+dsta, clamp to 1.0
these are incorrect:
PlusD
[PlusD does not follow the general equation. The equation is dst'=(1-dst)+(1-src).
If the result is less than 0 (black), then the result is 0.]
N/A
N/A
PlusL
[For PlusL, the addition saturates. That is, if (src+dst) > white), the result is white.]
1
1
PlusD dst=src+dst-1, clamp to 0.0; dsta=srca+dsta, clamp to 1.0
*/

Source/art/blit_scrapheap.m (new file, 242 lines)

@@ -0,0 +1,242 @@
/*
Copyright (C) 2004 Free Software Foundation, Inc.
Author: Alexander Malmberg <alexander@malmberg.org>
This file is part of GNUstep.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with this library; if not, write to the Free
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
This file is never compiled. It just contains alternate and/or partial
implementations of some functions in blit.m that are interesting but not
(in their current form) better than the implementations in blit.m .
*/
/**** blit_alpha_opaque for 16/15 bpp ****/
/*
Do two pixels at once. Turned out to be slower than a straight loop.
*/
const unsigned char *src = asrc;
BLEND_TYPE *dst = (BLEND_TYPE *)adst;
unsigned int nr, ng, nb, a;
/*
bbbb bggg gggr rrrr a0
b bbbb gggg ggrr rrr a1
bb bbbg gggg grrr rr a2
bbb bbgg gggg rrrr r a3
bbbb bggg gggr rrrr a4
b bbbb gggg ggrr rrr a5
bb bbbg gggg grrr rr a6
bbb bbgg gggg rrrr r a7
R rrrr rrrr rrrr
Ggg gggg gggg ggg
Bbbb bbbb bbbb b
bbbb b000 0000 0000 bbbb b000 0000 0000 a0
b bbbb 0000 0000 000b bbbb 0000 0000 000 a1
bb bbb0 0000 0000 00bb bbb0 0000 0000 00 a2
bbb bb00 0000 0000 0bbb bb00 0000 0000 0 a3
bbbb b000 0000 0000 bbbb b000 0000 0000 a4
b bbbb 0000 0000 000b bbbb 0000 0000 000 a5
bb bbb0 0000 0000 00bb bbb0 0000 0000 00 a6
bbb bb00 0000 0000 0bbb bb00 0000 0000 0 a7
bbbb b000 0000 0000 bbbb b000 0000 0000 b0
b bbbb 0000 0000 000b bbbb 0000 0000 000 b1
bb bbb0 0000 0000 00bb bbb0 0000 0000 00 b2
bbb bb00 0000 0000 0bbb bb00 0000 0000 0 b3
bbbb b000 0000 0000 bbbb b000 0000 0000 b4
b bbbb 0000 0000 000b bbbb 0000 0000 000 b5
bb bbb0 0000 0000 00bb bbb0 0000 0000 00 b6
bbb bb00 0000 0000 0bbb bb00 0000 0000 0 b7
BBBB BBBB BBBB B000 XXXX XXXX XXXX XXXX XXXX X000 AAAA AAAA AAAA A000 0000 0000
bbbb bbbb 0000 0000 0000 0000 aaaa aaaa
*/
//printf("call with color=%02x %02x %02x\n",r,g,b);
while (num >= 2)
{
unsigned int v1,v2;
unsigned int a1;
unsigned int temp;
a = *((unsigned short *)src);
// printf("alpha=%04x\n",a);
a = a&0xffff;
/* if (!a)
{
num -= 2;
src += 2;
dst += 2;
continue;
}*/
a = (a|(a<<8))&0xff00ff;
// printf("unpack to %08x\n",a);
v1 = b*a;
v1 = (v1>>11)&0x001f001f;
// printf("blue: %08x\n",v1);
v2 = g*a;
v2 = (v2>> 5)&0x07e007e0;
// printf("green: %08x\n",v2);
v1 = v1|v2;
v2 = r*a;
v2 = v2 &0xf800f800;
// printf("red: %08x\n",v2);
v1 = v1|v2;
// printf("result: %08x\n",v1);
a = 0xff00ff - a;
temp = dst[0];
// printf("p1: %04x\n",temp);
asm (
"movl %0,%%eax\n"
#if FORMAT_HOW == DI_16_B5_G6_R5
"shll $21,%%eax\n"
#else
"shll $22,%%eax\n"
#endif
"orl %0,%%eax\n"
#if FORMAT_HOW == DI_16_B5_G6_R5
"andl $0xfc00001f,%%eax\n"
"andl $0xf800,%0\n"
"shll $2,%0\n"
#else
"andl $0xf800001f,%%eax\n"
"andl $0x7c00,%0\n"
"shll $3,%0\n"
#endif
"orl %0,%%eax\n"
"mul %2\n"
"addl $0x0003e01f,%%eax\n"
#if FORMAT_HOW == DI_16_B5_G6_R5
"addl $0x02,%%edx\n"
"andl $0xfc,%%edx\n"
"shll $3,%%edx\n"
#else
"addl $0x01,%%edx\n"
"andl $0xf8,%%edx\n"
"shll $2,%%edx\n"
#endif
"shrl $8,%%eax\n"
"movl %%eax,%0\n"
"andl $0x1f,%0\n"
"orl %%edx,%0\n"
#if FORMAT_HOW == DI_16_B5_G6_R5
"shrl $2,%%eax\n"
"andl $0xf800,%%eax\n"
#else
"shrl $3,%%eax\n"
"andl $0x7c00,%%eax\n"
#endif
"orl %%eax,%0\n"
: "=r" (temp)
: "0" (temp), "c" (a&0xff)
: "eax", "edx");
// printf("to: %04x\n",temp);
v1 += temp;
// printf("add in gives: %08x\n",v1);
temp = dst[1];
// printf("p2: %04x\n",temp);
asm (
"movl %0,%%eax\n"
#if FORMAT_HOW == DI_16_B5_G6_R5
"shll $21,%%eax\n"
#else
"shll $22,%%eax\n"
#endif
"orl %0,%%eax\n"
#if FORMAT_HOW == DI_16_B5_G6_R5
"andl $0xfc00001f,%%eax\n"
"andl $0xf800,%0\n"
"shll $2,%0\n"
#else
"andl $0xf800001f,%%eax\n"
"andl $0x7c00,%0\n"
"shll $3,%0\n"
#endif
"orl %0,%%eax\n"
"mul %2\n"
"addl $0x0003e01f,%%eax\n"
#if FORMAT_HOW == DI_16_B5_G6_R5
"addl $0x02,%%edx\n"
"andl $0xfc,%%edx\n"
"shll $3,%%edx\n"
#else
"addl $0x01,%%edx\n"
"andl $0xf8,%%edx\n"
"shll $2,%%edx\n"
#endif
"shrl $8,%%eax\n"
"movl %%eax,%0\n"
"andl $0x1f,%0\n"
"orl %%edx,%0\n"
#if FORMAT_HOW == DI_16_B5_G6_R5
"shrl $2,%%eax\n"
"andl $0xf800,%%eax\n"
#else
"shrl $3,%%eax\n"
"andl $0x7c00,%%eax\n"
#endif
"orl %%eax,%0\n"
: "=r" (temp)
: "0" (temp), "c" (a>>16)
: "eax", "edx");
// printf("to: %04x\n",temp);
v1 += (temp)<<16;
// printf("add in gives: %08x\n",v1);
*((unsigned int *)dst)=v1;
num -= 2;
dst += 2;
src += 2;
}
if (num)
{
a = *src;
// if (a)
{
BLEND_READ(dst, nr, ng, nb)
/* nr = inv_gamma_table[nr];
ng = inv_gamma_table[ng];
nb = inv_gamma_table[nb];*/
nr = (r * a + nr * (255 - a) + 0xff) >> 8;
ng = (g * a + ng * (255 - a) + 0xff) >> 8;
nb = (b * a + nb * (255 - a) + 0xff) >> 8;
/* nr = gamma_table[nr];
ng = gamma_table[ng];
nb = gamma_table[nb];*/
BLEND_WRITE(dst, nr, ng, nb)
BLEND_INC(dst)
}
}
/**** ****/
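[Editor's note: the two-pixel loop above rests on a small SWAR step: (a | (a << 8)) & 0xff00ff spreads the two alphas so that a single multiply scales one colour component by both of them at once. A standalone sketch of just that step (the function name is mine), checked against doing the two pixels separately:]

#include <stdio.h>
#include <stdint.h>

/* Scale one 8-bit component c by two 8-bit alphas at once and quantize
   both results to 5 bits, as the two-pixel loop does for each component. */
static uint32_t scale_two(unsigned c, unsigned a0, unsigned a1)
{
  uint32_t a = a0 | (a1 << 8);           /* two alphas in one 16-bit word */

  a = (a | (a << 8)) & 0x00ff00ff;       /* a0 in bits 0-7, a1 in bits 16-23 */
  return ((c * a) >> 11) & 0x001f001f;   /* both 5-bit results in one word */
}

int main(void)
{
  unsigned c, a0, a1;

  for (c = 0; c < 256; c++)
    for (a0 = 0; a0 < 256; a0++)
      for (a1 = 0; a1 < 256; a1++)
        {
          uint32_t lo = ((c * a0) >> 11) & 0x1f;
          uint32_t hi = ((c * a1) >> 11) & 0x1f;

          if (scale_two(c, a0, a1) != (lo | (hi << 16)))
            printf("mismatch c=%u a0=%u a1=%u\n", c, a0, a1);
        }
  return 0;
}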