mirror of
https://github.com/ZDoom/Raze.git
synced 2025-01-18 06:32:37 +00:00
VP8: unroll 3 planes -> packed conversion loop.
On an AMD Phenom II X4 system with generic memory modules, this brings down the mean time for this conversion from 16.5 ms to 10.5 ms (GCC 4.6.1, optimized build).
git-svn-id: https://svn.eduke32.com/eduke32@2830 1a8010ca-5511-0410-912e-c29ae57300e0
This commit is contained in:
parent
cd1401fb52
commit
43fe858652
1 changed file with 20 additions and 4 deletions
|
@ -296,13 +296,29 @@ read_ivf_frame:
|
|||
int32_t x, y;
|
||||
const int32_t width=img->d_w, height = img->d_h;
|
||||
|
||||
for (y=0; y<height; y++)
|
||||
for (x=0; x<width; x++)
|
||||
for (y=0; y<height; y+=2)
|
||||
{
|
||||
for (x=0; x<width; x+=2)
|
||||
{
|
||||
uint8_t u = uplane[ustride*(y>>1) + (x>>1)];
|
||||
uint8_t v = vplane[vstride*(y>>1) + (x>>1)];
|
||||
|
||||
dstpic[(width*y + x)<<2] = yplane[ystride*y + x];
|
||||
dstpic[((width*y + x)<<2) + 1] = uplane[ustride*(y>>1) + (x>>1)];
|
||||
dstpic[((width*y + x)<<2) + 2] = vplane[vstride*(y>>1) + (x>>1)];
|
||||
dstpic[(width*y + x+1)<<2] = yplane[ystride*y + x+1];
|
||||
dstpic[(width*(y+1) + x)<<2] = yplane[ystride*(y+1) + x];
|
||||
dstpic[(width*(y+1) + x+1)<<2] = yplane[ystride*(y+1) + x+1];
|
||||
|
||||
dstpic[((width*y + x)<<2) + 1] = u;
|
||||
dstpic[((width*y + x+1)<<2) + 1] = u;
|
||||
dstpic[((width*(y+1) + x)<<2) + 1] = u;
|
||||
dstpic[((width*(y+1) + x+1)<<2) + 1] = u;
|
||||
|
||||
dstpic[((width*y + x)<<2) + 2] = v;
|
||||
dstpic[((width*y + x+1)<<2) + 2] = v;
|
||||
dstpic[((width*(y+1) + x)<<2) + 2] = v;
|
||||
dstpic[((width*(y+1) + x+1)<<2) + 2] = v;
|
||||
}
|
||||
}
|
||||
|
||||
// initprintf("%d ms\n", getticks()-t);
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue