VP8: unroll the 3-planes -> packed conversion loop.

On an AMD Phenom II X4 system with generic memory modules, this brings the
mean time for this conversion down from 16.5 ms to 10.5 ms.
(GCC 4.6.1, optimized build)

git-svn-id: https://svn.eduke32.com/eduke32@2830 1a8010ca-5511-0410-912e-c29ae57300e0
This commit is contained in:
helixhorned 2012-07-13 18:20:43 +00:00
parent cd1401fb52
commit 43fe858652

View file

@@ -296,13 +296,29 @@ read_ivf_frame:
int32_t x, y;
const int32_t width=img->d_w, height = img->d_h;
for (y=0; y<height; y++)
for (x=0; x<width; x++)
for (y=0; y<height; y+=2)
{
for (x=0; x<width; x+=2)
{
uint8_t u = uplane[ustride*(y>>1) + (x>>1)];
uint8_t v = vplane[vstride*(y>>1) + (x>>1)];
dstpic[(width*y + x)<<2] = yplane[ystride*y + x];
dstpic[((width*y + x)<<2) + 1] = uplane[ustride*(y>>1) + (x>>1)];
dstpic[((width*y + x)<<2) + 2] = vplane[vstride*(y>>1) + (x>>1)];
dstpic[(width*y + x+1)<<2] = yplane[ystride*y + x+1];
dstpic[(width*(y+1) + x)<<2] = yplane[ystride*(y+1) + x];
dstpic[(width*(y+1) + x+1)<<2] = yplane[ystride*(y+1) + x+1];
dstpic[((width*y + x)<<2) + 1] = u;
dstpic[((width*y + x+1)<<2) + 1] = u;
dstpic[((width*(y+1) + x)<<2) + 1] = u;
dstpic[((width*(y+1) + x+1)<<2) + 1] = u;
dstpic[((width*y + x)<<2) + 2] = v;
dstpic[((width*y + x+1)<<2) + 2] = v;
dstpic[((width*(y+1) + x)<<2) + 2] = v;
dstpic[((width*(y+1) + x+1)<<2) + 2] = v;
}
}
// initprintf("%d ms\n", getticks()-t);
}