From 43fe8586521058811e6cfb75ba7a6a9ba7aea666 Mon Sep 17 00:00:00 2001 From: helixhorned Date: Fri, 13 Jul 2012 18:20:43 +0000 Subject: [PATCH] VP8: unroll 3 planes -> packed conversion loop. On an AMD Phenom II X4 system with generic memory modules, this brings down the mean time for this conversion from 16.5 to 10.5 ms. (GCC 4.6.1, optimized build) git-svn-id: https://svn.eduke32.com/eduke32@2830 1a8010ca-5511-0410-912e-c29ae57300e0 --- polymer/eduke32/source/animvpx.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/polymer/eduke32/source/animvpx.c b/polymer/eduke32/source/animvpx.c index 5b6231df5..02e14e287 100644 --- a/polymer/eduke32/source/animvpx.c +++ b/polymer/eduke32/source/animvpx.c @@ -296,13 +296,29 @@ read_ivf_frame: int32_t x, y; const int32_t width=img->d_w, height = img->d_h; - for (y=0; y<height; y++) - for (x=0; x<width; x++) + for (y=0; y<height; y+=2) + { + for (x=0; x<width; x+=2) { + uint8_t u = uplane[ustride*(y>>1) + (x>>1)]; + uint8_t v = vplane[vstride*(y>>1) + (x>>1)]; + dstpic[(width*y + x)<<2] = yplane[ystride*y + x]; - dstpic[((width*y + x)<<2) + 1] = uplane[ustride*(y>>1) + (x>>1)]; - dstpic[((width*y + x)<<2) + 2] = vplane[vstride*(y>>1) + (x>>1)]; + dstpic[(width*y + x+1)<<2] = yplane[ystride*y + x+1]; + dstpic[(width*(y+1) + x)<<2] = yplane[ystride*(y+1) + x]; + dstpic[(width*(y+1) + x+1)<<2] = yplane[ystride*(y+1) + x+1]; + + dstpic[((width*y + x)<<2) + 1] = u; + dstpic[((width*y + x+1)<<2) + 1] = u; + dstpic[((width*(y+1) + x)<<2) + 1] = u; + dstpic[((width*(y+1) + x+1)<<2) + 1] = u; + + dstpic[((width*y + x)<<2) + 2] = v; + dstpic[((width*y + x+1)<<2) + 2] = v; + dstpic[((width*(y+1) + x)<<2) + 2] = v; + dstpic[((width*(y+1) + x+1)<<2) + 2] = v; } + } // initprintf("%d ms\n", getticks()-t); }