mirror of
https://github.com/ZDoom/gzdoom.git
synced 2024-11-20 03:01:31 +00:00
1273 lines
36 KiB
C++
1273 lines
36 KiB
C++
/*
|
|
** r_drawt_rgba.cpp
|
|
** Faster column drawers for modern processors, true color edition
|
|
**
|
|
**---------------------------------------------------------------------------
|
|
** Copyright 1998-2006 Randy Heit
|
|
** All rights reserved.
|
|
**
|
|
** Redistribution and use in source and binary forms, with or without
|
|
** modification, are permitted provided that the following conditions
|
|
** are met:
|
|
**
|
|
** 1. Redistributions of source code must retain the above copyright
|
|
** notice, this list of conditions and the following disclaimer.
|
|
** 2. Redistributions in binary form must reproduce the above copyright
|
|
** notice, this list of conditions and the following disclaimer in the
|
|
** documentation and/or other materials provided with the distribution.
|
|
** 3. The name of the author may not be used to endorse or promote products
|
|
** derived from this software without specific prior written permission.
|
|
**
|
|
** THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
** IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
** OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
** IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
** INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
** NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
** DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
** THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
** THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
**---------------------------------------------------------------------------
|
|
**
|
|
** True color versions of the similar functions in r_drawt.cpp
|
|
** Please see r_drawt.cpp for a description of the globals used.
|
|
*/
|
|
|
|
#include "templates.h"
|
|
#include "doomtype.h"
|
|
#include "doomdef.h"
|
|
#include "r_defs.h"
|
|
#include "r_draw.h"
|
|
#include "r_main.h"
|
|
#include "r_things.h"
|
|
#include "v_video.h"
|
|
#ifndef NO_SSE
|
|
#include <emmintrin.h>
|
|
#endif
|
|
|
|
uint32_t dc_temp_rgbabuff_rgba[MAXHEIGHT*4];
|
|
uint32_t *dc_temp_rgba;
|
|
|
|
// Defined in r_draw_t.cpp:
|
|
extern unsigned int dc_tspans[4][MAXHEIGHT];
|
|
extern unsigned int *dc_ctspan[4];
|
|
extern unsigned int *horizspan[4];
|
|
|
|
// Copies one span at hx to the screen at sx.
|
|
void rt_copy1col_RGBA_c (int hx, int sx, int yl, int yh)
|
|
{
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh-yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl*4 + hx];
|
|
pitch = dc_pitch;
|
|
|
|
if (count & 1) {
|
|
*dest = *source;
|
|
source += 4;
|
|
dest += pitch;
|
|
}
|
|
if (count & 2) {
|
|
dest[0] = source[0];
|
|
dest[pitch] = source[4];
|
|
source += 8;
|
|
dest += pitch*2;
|
|
}
|
|
if (!(count >>= 2))
|
|
return;
|
|
|
|
do {
|
|
dest[0] = source[0];
|
|
dest[pitch] = source[4];
|
|
dest[pitch*2] = source[8];
|
|
dest[pitch*3] = source[12];
|
|
source += 16;
|
|
dest += pitch*4;
|
|
} while (--count);
|
|
}
|
|
|
|
// Copies all four spans to the screen starting at sx.
|
|
void rt_copy4cols_RGBA_c (int sx, int yl, int yh)
|
|
{
|
|
// To do: we could do this with SSE using __m128i
|
|
rt_copy1col_RGBA_c(0, sx, yl, yh);
|
|
rt_copy1col_RGBA_c(1, sx + 1, yl, yh);
|
|
rt_copy1col_RGBA_c(2, sx + 2, yl, yh);
|
|
rt_copy1col_RGBA_c(3, sx + 3, yl, yh);
|
|
}
|
|
|
|
// Maps one span at hx to the screen at sx.
|
|
void rt_map1col_RGBA_c (int hx, int sx, int yl, int yh)
|
|
{
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh-yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
uint32_t light = calc_light_multiplier(dc_light);
|
|
ShadeConstants shade_constants = dc_shade_constants;
|
|
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl*4 + hx];
|
|
pitch = dc_pitch;
|
|
|
|
if (count & 1) {
|
|
*dest = shade_pal_index(*source, light, shade_constants);
|
|
source += 4;
|
|
dest += pitch;
|
|
}
|
|
if (!(count >>= 1))
|
|
return;
|
|
|
|
do {
|
|
dest[0] = shade_pal_index(source[0], light, shade_constants);
|
|
dest[pitch] = shade_pal_index(source[4], light, shade_constants);
|
|
source += 8;
|
|
dest += pitch*2;
|
|
} while (--count);
|
|
}
|
|
|
|
// Maps all four spans to the screen starting at sx.
|
|
void rt_map4cols_RGBA_c (int sx, int yl, int yh)
|
|
{
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh-yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
uint32_t light = calc_light_multiplier(dc_light);
|
|
ShadeConstants shade_constants = dc_shade_constants;
|
|
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl*4];
|
|
pitch = dc_pitch;
|
|
|
|
if (count & 1) {
|
|
dest[0] = shade_pal_index(source[0], light, shade_constants);
|
|
dest[1] = shade_pal_index(source[1], light, shade_constants);
|
|
dest[2] = shade_pal_index(source[2], light, shade_constants);
|
|
dest[3] = shade_pal_index(source[3], light, shade_constants);
|
|
source += 4;
|
|
dest += pitch;
|
|
}
|
|
if (!(count >>= 1))
|
|
return;
|
|
|
|
do {
|
|
dest[0] = shade_pal_index(source[0], light, shade_constants);
|
|
dest[1] = shade_pal_index(source[1], light, shade_constants);
|
|
dest[2] = shade_pal_index(source[2], light, shade_constants);
|
|
dest[3] = shade_pal_index(source[3], light, shade_constants);
|
|
dest[pitch] = shade_pal_index(source[4], light, shade_constants);
|
|
dest[pitch + 1] = shade_pal_index(source[5], light, shade_constants);
|
|
dest[pitch + 2] = shade_pal_index(source[6], light, shade_constants);
|
|
dest[pitch + 3] = shade_pal_index(source[7], light, shade_constants);
|
|
source += 8;
|
|
dest += pitch*2;
|
|
} while (--count);
|
|
}
|
|
|
|
// Maps all four spans to the screen starting at sx.
|
|
void rt_map4cols_RGBA_SSE(int sx, int yl, int yh)
|
|
{
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh - yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
ShadeConstants shade_constants = dc_shade_constants;
|
|
uint32_t light = calc_light_multiplier(dc_light);
|
|
uint32_t *palette = (uint32_t*)GPalette.BaseColors;
|
|
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl * 4];
|
|
pitch = dc_pitch;
|
|
|
|
if (shade_constants.simple_shade)
|
|
{
|
|
SSE_SHADE_SIMPLE_INIT(light);
|
|
|
|
if (count & 1) {
|
|
uint32_t p0 = source[0];
|
|
uint32_t p1 = source[1];
|
|
uint32_t p2 = source[2];
|
|
uint32_t p3 = source[3];
|
|
|
|
// shade_pal_index:
|
|
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
|
SSE_SHADE_SIMPLE(fg);
|
|
_mm_storeu_si128((__m128i*)dest, fg);
|
|
|
|
source += 4;
|
|
dest += pitch;
|
|
}
|
|
if (!(count >>= 1))
|
|
return;
|
|
|
|
do {
|
|
// shade_pal_index 0-3
|
|
{
|
|
uint32_t p0 = source[0];
|
|
uint32_t p1 = source[1];
|
|
uint32_t p2 = source[2];
|
|
uint32_t p3 = source[3];
|
|
|
|
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
|
SSE_SHADE_SIMPLE(fg);
|
|
_mm_storeu_si128((__m128i*)dest, fg);
|
|
}
|
|
|
|
// shade_pal_index 4-7 (pitch)
|
|
{
|
|
uint32_t p0 = source[4];
|
|
uint32_t p1 = source[5];
|
|
uint32_t p2 = source[6];
|
|
uint32_t p3 = source[7];
|
|
|
|
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
|
SSE_SHADE_SIMPLE(fg);
|
|
_mm_storeu_si128((__m128i*)(dest + pitch), fg);
|
|
}
|
|
|
|
source += 8;
|
|
dest += pitch * 2;
|
|
} while (--count);
|
|
}
|
|
else
|
|
{
|
|
SSE_SHADE_INIT(light, shade_constants);
|
|
|
|
if (count & 1) {
|
|
uint32_t p0 = source[0];
|
|
uint32_t p1 = source[1];
|
|
uint32_t p2 = source[2];
|
|
uint32_t p3 = source[3];
|
|
|
|
// shade_pal_index:
|
|
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
|
SSE_SHADE(fg, shade_constants);
|
|
_mm_storeu_si128((__m128i*)dest, fg);
|
|
|
|
source += 4;
|
|
dest += pitch;
|
|
}
|
|
if (!(count >>= 1))
|
|
return;
|
|
|
|
do {
|
|
// shade_pal_index 0-3
|
|
{
|
|
uint32_t p0 = source[0];
|
|
uint32_t p1 = source[1];
|
|
uint32_t p2 = source[2];
|
|
uint32_t p3 = source[3];
|
|
|
|
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
|
SSE_SHADE(fg, shade_constants);
|
|
_mm_storeu_si128((__m128i*)dest, fg);
|
|
}
|
|
|
|
// shade_pal_index 4-7 (pitch)
|
|
{
|
|
uint32_t p0 = source[4];
|
|
uint32_t p1 = source[5];
|
|
uint32_t p2 = source[6];
|
|
uint32_t p3 = source[7];
|
|
|
|
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
|
SSE_SHADE(fg, shade_constants);
|
|
_mm_storeu_si128((__m128i*)(dest + pitch), fg);
|
|
}
|
|
|
|
source += 8;
|
|
dest += pitch * 2;
|
|
} while (--count);
|
|
}
|
|
}
|
|
|
|
void rt_Translate1col_RGBA_c(const BYTE *translation, int hx, int yl, int yh)
|
|
{
|
|
int count = yh - yl + 1;
|
|
uint32_t *source = &dc_temp_rgba[yl*4 + hx];
|
|
|
|
// Things we do to hit the compiler's optimizer with a clue bat:
|
|
// 1. Parallelism is explicitly spelled out by using a separate
|
|
// C instruction for each assembly instruction. GCC lets me
|
|
// have four temporaries, but VC++ spills to the stack with
|
|
// more than two. Two is probably optimal, anyway.
|
|
// 2. The results of the translation lookups are explicitly
|
|
// stored in byte-sized variables. This causes the VC++ code
|
|
// to use byte mov instructions in most cases; for apparently
|
|
// random reasons, it will use movzx for some places. GCC
|
|
// ignores this and uses movzx always.
|
|
|
|
// Do 8 rows at a time.
|
|
for (int count8 = count >> 3; count8; --count8)
|
|
{
|
|
int c0, c1;
|
|
BYTE b0, b1;
|
|
|
|
c0 = source[0]; c1 = source[4];
|
|
b0 = translation[c0]; b1 = translation[c1];
|
|
source[0] = b0; source[4] = b1;
|
|
|
|
c0 = source[8]; c1 = source[12];
|
|
b0 = translation[c0]; b1 = translation[c1];
|
|
source[8] = b0; source[12] = b1;
|
|
|
|
c0 = source[16]; c1 = source[20];
|
|
b0 = translation[c0]; b1 = translation[c1];
|
|
source[16] = b0; source[20] = b1;
|
|
|
|
c0 = source[24]; c1 = source[28];
|
|
b0 = translation[c0]; b1 = translation[c1];
|
|
source[24] = b0; source[28] = b1;
|
|
|
|
source += 32;
|
|
}
|
|
// Finish by doing 1 row at a time.
|
|
for (count &= 7; count; --count, source += 4)
|
|
{
|
|
source[0] = translation[source[0]];
|
|
}
|
|
}
|
|
|
|
void rt_Translate4cols_RGBA_c(const BYTE *translation, int yl, int yh)
|
|
{
|
|
int count = yh - yl + 1;
|
|
uint32_t *source = &dc_temp_rgba[yl*4];
|
|
int c0, c1;
|
|
BYTE b0, b1;
|
|
|
|
// Do 2 rows at a time.
|
|
for (int count8 = count >> 1; count8; --count8)
|
|
{
|
|
c0 = source[0]; c1 = source[1];
|
|
b0 = translation[c0]; b1 = translation[c1];
|
|
source[0] = b0; source[1] = b1;
|
|
|
|
c0 = source[2]; c1 = source[3];
|
|
b0 = translation[c0]; b1 = translation[c1];
|
|
source[2] = b0; source[3] = b1;
|
|
|
|
c0 = source[4]; c1 = source[5];
|
|
b0 = translation[c0]; b1 = translation[c1];
|
|
source[4] = b0; source[5] = b1;
|
|
|
|
c0 = source[6]; c1 = source[7];
|
|
b0 = translation[c0]; b1 = translation[c1];
|
|
source[6] = b0; source[7] = b1;
|
|
|
|
source += 8;
|
|
}
|
|
// Do the final row if count was odd.
|
|
if (count & 1)
|
|
{
|
|
c0 = source[0]; c1 = source[1];
|
|
b0 = translation[c0]; b1 = translation[c1];
|
|
source[0] = b0; source[1] = b1;
|
|
|
|
c0 = source[2]; c1 = source[3];
|
|
b0 = translation[c0]; b1 = translation[c1];
|
|
source[2] = b0; source[3] = b1;
|
|
}
|
|
}
|
|
|
|
// Translates one span at hx to the screen at sx.
|
|
void rt_tlate1col_RGBA_c (int hx, int sx, int yl, int yh)
|
|
{
|
|
rt_Translate1col_RGBA_c(dc_translation, hx, yl, yh);
|
|
rt_map1col(hx, sx, yl, yh);
|
|
}
|
|
|
|
// Translates all four spans to the screen starting at sx.
|
|
void rt_tlate4cols_RGBA_c (int sx, int yl, int yh)
|
|
{
|
|
rt_Translate4cols_RGBA_c(dc_translation, yl, yh);
|
|
rt_map4cols(sx, yl, yh);
|
|
}
|
|
|
|
// Adds one span at hx to the screen at sx without clamping.
|
|
void rt_add1col_RGBA_c (int hx, int sx, int yl, int yh)
|
|
{
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh-yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl*4 + hx];
|
|
pitch = dc_pitch;
|
|
|
|
uint32_t light = calc_light_multiplier(dc_light);
|
|
ShadeConstants shade_constants = dc_shade_constants;
|
|
|
|
uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8);
|
|
uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8);
|
|
|
|
do {
|
|
uint32_t fg = shade_pal_index(*source, light, shade_constants);
|
|
uint32_t fg_red = (fg >> 16) & 0xff;
|
|
uint32_t fg_green = (fg >> 8) & 0xff;
|
|
uint32_t fg_blue = fg & 0xff;
|
|
|
|
uint32_t bg_red = (*dest >> 16) & 0xff;
|
|
uint32_t bg_green = (*dest >> 8) & 0xff;
|
|
uint32_t bg_blue = (*dest) & 0xff;
|
|
|
|
uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255);
|
|
uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255);
|
|
uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
|
|
|
|
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
|
|
|
|
source += 4;
|
|
dest += pitch;
|
|
} while (--count);
|
|
}
|
|
|
|
// Adds all four spans to the screen starting at sx without clamping.
|
|
void rt_add4cols_RGBA_c (int sx, int yl, int yh)
|
|
{
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh-yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl*4];
|
|
pitch = dc_pitch;
|
|
|
|
uint32_t light = calc_light_multiplier(dc_light);
|
|
ShadeConstants shade_constants = dc_shade_constants;
|
|
|
|
uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8);
|
|
uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8);
|
|
|
|
do {
|
|
for (int i = 0; i < 4; i++)
|
|
{
|
|
uint32_t fg = shade_pal_index(source[i], light, shade_constants);
|
|
uint32_t fg_red = (fg >> 16) & 0xff;
|
|
uint32_t fg_green = (fg >> 8) & 0xff;
|
|
uint32_t fg_blue = fg & 0xff;
|
|
|
|
uint32_t bg_red = (dest[i] >> 16) & 0xff;
|
|
uint32_t bg_green = (dest[i] >> 8) & 0xff;
|
|
uint32_t bg_blue = (dest[i]) & 0xff;
|
|
|
|
uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255);
|
|
uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255);
|
|
uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
|
|
|
|
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
|
|
}
|
|
|
|
source += 4;
|
|
dest += pitch;
|
|
} while (--count);
|
|
}
|
|
|
|
// Adds all four spans to the screen starting at sx without clamping.
|
|
#ifndef NO_SSE
|
|
void rt_add4cols_RGBA_SSE(int sx, int yl, int yh)
|
|
{
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh - yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl * 4];
|
|
pitch = dc_pitch;
|
|
|
|
uint32_t light = calc_light_multiplier(dc_light);
|
|
uint32_t *palette = (uint32_t*)GPalette.BaseColors;
|
|
|
|
uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8);
|
|
uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8);
|
|
|
|
ShadeConstants shade_constants = dc_shade_constants;
|
|
|
|
if (shade_constants.simple_shade)
|
|
{
|
|
SSE_SHADE_SIMPLE_INIT(light);
|
|
|
|
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
|
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
|
|
|
|
do {
|
|
uint32_t p0 = source[0];
|
|
uint32_t p1 = source[1];
|
|
uint32_t p2 = source[2];
|
|
uint32_t p3 = source[3];
|
|
|
|
// shade_pal_index:
|
|
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
|
SSE_SHADE_SIMPLE(fg);
|
|
|
|
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
|
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
|
|
|
// unpack bg:
|
|
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
|
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
|
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
|
|
|
// (fg_red * fg_alpha + bg_red * bg_alpha) / 256:
|
|
__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
|
|
__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
|
|
|
|
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
|
_mm_storeu_si128((__m128i*)dest, color);
|
|
|
|
source += 4;
|
|
dest += pitch;
|
|
} while (--count);
|
|
}
|
|
else
|
|
{
|
|
SSE_SHADE_INIT(light, shade_constants);
|
|
|
|
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
|
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
|
|
|
|
do {
|
|
uint32_t p0 = source[0];
|
|
uint32_t p1 = source[1];
|
|
uint32_t p2 = source[2];
|
|
uint32_t p3 = source[3];
|
|
|
|
// shade_pal_index:
|
|
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
|
SSE_SHADE(fg, shade_constants);
|
|
|
|
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
|
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
|
|
|
// unpack bg:
|
|
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
|
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
|
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
|
|
|
// (fg_red * fg_alpha + bg_red * bg_alpha) / 256:
|
|
__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
|
|
__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
|
|
|
|
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
|
_mm_storeu_si128((__m128i*)dest, color);
|
|
|
|
source += 4;
|
|
dest += pitch;
|
|
} while (--count);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
// Translates and adds one span at hx to the screen at sx without clamping.
|
|
void rt_tlateadd1col_RGBA_c (int hx, int sx, int yl, int yh)
|
|
{
|
|
rt_Translate1col_RGBA_c(dc_translation, hx, yl, yh);
|
|
rt_add1col(hx, sx, yl, yh);
|
|
}
|
|
|
|
// Translates and adds all four spans to the screen starting at sx without clamping.
|
|
void rt_tlateadd4cols_RGBA_c(int sx, int yl, int yh)
|
|
{
|
|
rt_Translate4cols_RGBA_c(dc_translation, yl, yh);
|
|
rt_add4cols(sx, yl, yh);
|
|
}
|
|
|
|
// Shades one span at hx to the screen at sx.
|
|
void rt_shaded1col_RGBA_c (int hx, int sx, int yl, int yh)
|
|
{
|
|
BYTE *colormap;
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh-yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
colormap = dc_colormap;
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl*4 + hx];
|
|
pitch = dc_pitch;
|
|
|
|
uint32_t fg = shade_pal_index_simple(dc_color, calc_light_multiplier(dc_light));
|
|
uint32_t fg_red = (fg >> 16) & 0xff;
|
|
uint32_t fg_green = (fg >> 8) & 0xff;
|
|
uint32_t fg_blue = fg & 0xff;
|
|
|
|
do {
|
|
uint32_t alpha = colormap[*source];
|
|
uint32_t inv_alpha = 64 - alpha;
|
|
|
|
uint32_t bg_red = (*dest >> 16) & 0xff;
|
|
uint32_t bg_green = (*dest >> 8) & 0xff;
|
|
uint32_t bg_blue = (*dest) & 0xff;
|
|
|
|
uint32_t red = (fg_red * alpha + bg_red * inv_alpha) / 64;
|
|
uint32_t green = (fg_green * alpha + bg_green * inv_alpha) / 64;
|
|
uint32_t blue = (fg_blue * alpha + bg_blue * inv_alpha) / 64;
|
|
|
|
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
|
|
source += 4;
|
|
dest += pitch;
|
|
} while (--count);
|
|
}
|
|
|
|
// Shades all four spans to the screen starting at sx.
|
|
void rt_shaded4cols_RGBA_c (int sx, int yl, int yh)
|
|
{
|
|
BYTE *colormap;
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh-yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
colormap = dc_colormap;
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl*4];
|
|
pitch = dc_pitch;
|
|
|
|
uint32_t fg = shade_pal_index_simple(dc_color, calc_light_multiplier(dc_light));
|
|
uint32_t fg_red = (fg >> 16) & 0xff;
|
|
uint32_t fg_green = (fg >> 8) & 0xff;
|
|
uint32_t fg_blue = fg & 0xff;
|
|
|
|
do {
|
|
for (int i = 0; i < 4; i++)
|
|
{
|
|
uint32_t alpha = colormap[source[i]];
|
|
uint32_t inv_alpha = 64 - alpha;
|
|
|
|
uint32_t bg_red = (dest[i] >> 16) & 0xff;
|
|
uint32_t bg_green = (dest[i] >> 8) & 0xff;
|
|
uint32_t bg_blue = (dest[i]) & 0xff;
|
|
|
|
uint32_t red = (fg_red * alpha + bg_red * inv_alpha) / 64;
|
|
uint32_t green = (fg_green * alpha + bg_green * inv_alpha) / 64;
|
|
uint32_t blue = (fg_blue * alpha + bg_blue * inv_alpha) / 64;
|
|
|
|
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
|
|
}
|
|
source += 4;
|
|
dest += pitch;
|
|
} while (--count);
|
|
}
|
|
|
|
// Shades all four spans to the screen starting at sx.
|
|
#ifndef NO_SSE
|
|
void rt_shaded4cols_RGBA_SSE(int sx, int yl, int yh)
|
|
{
|
|
BYTE *colormap;
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh - yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
colormap = dc_colormap;
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl * 4];
|
|
pitch = dc_pitch;
|
|
|
|
__m128i fg = _mm_unpackhi_epi8(_mm_set1_epi32(shade_pal_index_simple(dc_color, calc_light_multiplier(dc_light))), _mm_setzero_si128());
|
|
__m128i alpha_one = _mm_set1_epi16(64);
|
|
|
|
do {
|
|
uint32_t p0 = colormap[source[0]];
|
|
uint32_t p1 = colormap[source[1]];
|
|
uint32_t p2 = colormap[source[2]];
|
|
uint32_t p3 = colormap[source[3]];
|
|
|
|
__m128i alpha_hi = _mm_set_epi16(64, p3, p3, p3, 64, p2, p2, p2);
|
|
__m128i alpha_lo = _mm_set_epi16(64, p1, p1, p1, 64, p0, p0, p0);
|
|
__m128i inv_alpha_hi = _mm_subs_epu16(alpha_one, alpha_hi);
|
|
__m128i inv_alpha_lo = _mm_subs_epu16(alpha_one, alpha_lo);
|
|
|
|
// unpack bg:
|
|
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
|
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
|
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
|
|
|
// (fg_red * alpha + bg_red * inv_alpha) / 64:
|
|
__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg, alpha_hi), _mm_mullo_epi16(bg_hi, inv_alpha_hi)), 6);
|
|
__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg, alpha_lo), _mm_mullo_epi16(bg_lo, inv_alpha_lo)), 6);
|
|
|
|
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
|
_mm_storeu_si128((__m128i*)dest, color);
|
|
|
|
source += 4;
|
|
dest += pitch;
|
|
} while (--count);
|
|
}
|
|
#endif
|
|
|
|
// Adds one span at hx to the screen at sx with clamping.
|
|
void rt_addclamp1col_RGBA_c (int hx, int sx, int yl, int yh)
|
|
{
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh-yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl*4 + hx];
|
|
pitch = dc_pitch;
|
|
|
|
uint32_t light = calc_light_multiplier(dc_light);
|
|
ShadeConstants shade_constants = dc_shade_constants;
|
|
|
|
uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8);
|
|
uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8);
|
|
|
|
do {
|
|
uint32_t fg = shade_pal_index(*source, light, shade_constants);
|
|
uint32_t fg_red = (fg >> 16) & 0xff;
|
|
uint32_t fg_green = (fg >> 8) & 0xff;
|
|
uint32_t fg_blue = fg & 0xff;
|
|
|
|
uint32_t bg_red = (*dest >> 16) & 0xff;
|
|
uint32_t bg_green = (*dest >> 8) & 0xff;
|
|
uint32_t bg_blue = (*dest) & 0xff;
|
|
|
|
uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255);
|
|
uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255);
|
|
uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
|
|
|
|
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
|
|
source += 4;
|
|
dest += pitch;
|
|
} while (--count);
|
|
}
|
|
|
|
// Adds all four spans to the screen starting at sx with clamping.
|
|
void rt_addclamp4cols_RGBA_c (int sx, int yl, int yh)
|
|
{
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh-yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl*4];
|
|
pitch = dc_pitch;
|
|
|
|
uint32_t light = calc_light_multiplier(dc_light);
|
|
ShadeConstants shade_constants = dc_shade_constants;
|
|
|
|
uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8);
|
|
uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8);
|
|
|
|
do {
|
|
for (int i = 0; i < 4; i++)
|
|
{
|
|
uint32_t fg = shade_pal_index(source[i], light, shade_constants);
|
|
uint32_t fg_red = (fg >> 16) & 0xff;
|
|
uint32_t fg_green = (fg >> 8) & 0xff;
|
|
uint32_t fg_blue = fg & 0xff;
|
|
|
|
uint32_t bg_red = (dest[i] >> 16) & 0xff;
|
|
uint32_t bg_green = (dest[i] >> 8) & 0xff;
|
|
uint32_t bg_blue = (dest[i]) & 0xff;
|
|
|
|
uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255);
|
|
uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255);
|
|
uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
|
|
|
|
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
|
|
}
|
|
source += 4;
|
|
dest += pitch;
|
|
} while (--count);
|
|
}
|
|
|
|
// Adds all four spans to the screen starting at sx with clamping.
|
|
#ifndef NO_SSE
|
|
void rt_addclamp4cols_RGBA_SSE(int sx, int yl, int yh)
|
|
{
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh - yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl * 4];
|
|
pitch = dc_pitch;
|
|
|
|
uint32_t light = calc_light_multiplier(dc_light);
|
|
uint32_t *palette = (uint32_t*)GPalette.BaseColors;
|
|
|
|
uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8);
|
|
uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8);
|
|
|
|
ShadeConstants shade_constants = dc_shade_constants;
|
|
|
|
if (shade_constants.simple_shade)
|
|
{
|
|
SSE_SHADE_SIMPLE_INIT(light);
|
|
|
|
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
|
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
|
|
|
|
do {
|
|
uint32_t p0 = source[0];
|
|
uint32_t p1 = source[1];
|
|
uint32_t p2 = source[2];
|
|
uint32_t p3 = source[3];
|
|
|
|
// shade_pal_index:
|
|
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
|
SSE_SHADE_SIMPLE(fg);
|
|
|
|
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
|
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
|
|
|
// unpack bg:
|
|
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
|
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
|
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
|
|
|
// (fg_red * fg_alpha + bg_red * bg_alpha) / 256:
|
|
__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
|
|
__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
|
|
|
|
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
|
_mm_storeu_si128((__m128i*)dest, color);
|
|
|
|
source += 4;
|
|
dest += pitch;
|
|
} while (--count);
|
|
}
|
|
else
|
|
{
|
|
SSE_SHADE_INIT(light, shade_constants);
|
|
|
|
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
|
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
|
|
|
|
do {
|
|
uint32_t p0 = source[0];
|
|
uint32_t p1 = source[1];
|
|
uint32_t p2 = source[2];
|
|
uint32_t p3 = source[3];
|
|
|
|
// shade_pal_index:
|
|
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
|
SSE_SHADE(fg, shade_constants);
|
|
|
|
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
|
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
|
|
|
// unpack bg:
|
|
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
|
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
|
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
|
|
|
// (fg_red * fg_alpha + bg_red * bg_alpha) / 256:
|
|
__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
|
|
__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
|
|
|
|
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
|
_mm_storeu_si128((__m128i*)dest, color);
|
|
|
|
source += 4;
|
|
dest += pitch;
|
|
} while (--count);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
// Translates and adds one span at hx to the screen at sx with clamping.
|
|
void rt_tlateaddclamp1col_RGBA_c (int hx, int sx, int yl, int yh)
|
|
{
|
|
rt_Translate1col_RGBA_c(dc_translation, hx, yl, yh);
|
|
rt_addclamp1col_RGBA_c(hx, sx, yl, yh);
|
|
}
|
|
|
|
// Translates and adds all four spans to the screen starting at sx with clamping.
|
|
void rt_tlateaddclamp4cols_RGBA_c (int sx, int yl, int yh)
|
|
{
|
|
rt_Translate4cols_RGBA_c(dc_translation, yl, yh);
|
|
rt_addclamp4cols(sx, yl, yh);
|
|
}
|
|
|
|
// Subtracts one span at hx to the screen at sx with clamping.
|
|
void rt_subclamp1col_RGBA_c (int hx, int sx, int yl, int yh)
|
|
{
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh-yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl*4 + hx];
|
|
pitch = dc_pitch;
|
|
|
|
uint32_t light = calc_light_multiplier(dc_light);
|
|
ShadeConstants shade_constants = dc_shade_constants;
|
|
|
|
uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8);
|
|
uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8);
|
|
|
|
do {
|
|
uint32_t fg = shade_pal_index(*source, light, shade_constants);
|
|
uint32_t fg_red = (fg >> 16) & 0xff;
|
|
uint32_t fg_green = (fg >> 8) & 0xff;
|
|
uint32_t fg_blue = fg & 0xff;
|
|
|
|
uint32_t bg_red = (*dest >> 16) & 0xff;
|
|
uint32_t bg_green = (*dest >> 8) & 0xff;
|
|
uint32_t bg_blue = (*dest) & 0xff;
|
|
|
|
uint32_t red = clamp<uint32_t>((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256;
|
|
uint32_t green = clamp<uint32_t>((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256;
|
|
uint32_t blue = clamp<uint32_t>((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
|
|
|
|
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
|
|
source += 4;
|
|
dest += pitch;
|
|
} while (--count);
|
|
}
|
|
|
|
// Subtracts all four spans to the screen starting at sx with clamping.
|
|
void rt_subclamp4cols_RGBA_c (int sx, int yl, int yh)
|
|
{
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh-yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl*4];
|
|
pitch = dc_pitch;
|
|
|
|
uint32_t light = calc_light_multiplier(dc_light);
|
|
ShadeConstants shade_constants = dc_shade_constants;
|
|
|
|
uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8);
|
|
uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8);
|
|
|
|
do {
|
|
for (int i = 0; i < 4; i++)
|
|
{
|
|
uint32_t fg = shade_pal_index(source[i], light, shade_constants);
|
|
uint32_t fg_red = (fg >> 16) & 0xff;
|
|
uint32_t fg_green = (fg >> 8) & 0xff;
|
|
uint32_t fg_blue = fg & 0xff;
|
|
|
|
uint32_t bg_red = (dest[i] >> 16) & 0xff;
|
|
uint32_t bg_green = (dest[i] >> 8) & 0xff;
|
|
uint32_t bg_blue = (dest[i]) & 0xff;
|
|
|
|
uint32_t red = clamp<uint32_t>((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256;
|
|
uint32_t green = clamp<uint32_t>((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256;
|
|
uint32_t blue = clamp<uint32_t>((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
|
|
|
|
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
|
|
}
|
|
|
|
source += 4;
|
|
dest += pitch;
|
|
} while (--count);
|
|
}
|
|
|
|
// Translates and subtracts one span at hx to the screen at sx with clamping.
|
|
void rt_tlatesubclamp1col_RGBA_c (int hx, int sx, int yl, int yh)
|
|
{
|
|
rt_Translate1col_RGBA_c(dc_translation, hx, yl, yh);
|
|
rt_subclamp1col_RGBA_c(hx, sx, yl, yh);
|
|
}
|
|
|
|
// Translates and subtracts all four spans to the screen starting at sx with clamping.
|
|
void rt_tlatesubclamp4cols_RGBA_c (int sx, int yl, int yh)
|
|
{
|
|
rt_Translate4cols_RGBA_c(dc_translation, yl, yh);
|
|
rt_subclamp4cols_RGBA_c(sx, yl, yh);
|
|
}
|
|
|
|
// Subtracts one span at hx from the screen at sx with clamping.
|
|
void rt_revsubclamp1col_RGBA_c (int hx, int sx, int yl, int yh)
|
|
{
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh-yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl*4 + hx];
|
|
pitch = dc_pitch;
|
|
|
|
uint32_t light = calc_light_multiplier(dc_light);
|
|
ShadeConstants shade_constants = dc_shade_constants;
|
|
|
|
uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8);
|
|
uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8);
|
|
|
|
do {
|
|
uint32_t fg = shade_pal_index(*source, light, shade_constants);
|
|
uint32_t fg_red = (fg >> 16) & 0xff;
|
|
uint32_t fg_green = (fg >> 8) & 0xff;
|
|
uint32_t fg_blue = fg & 0xff;
|
|
|
|
uint32_t bg_red = (*dest >> 16) & 0xff;
|
|
uint32_t bg_green = (*dest >> 8) & 0xff;
|
|
uint32_t bg_blue = (*dest) & 0xff;
|
|
|
|
uint32_t red = clamp<uint32_t>((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256;
|
|
uint32_t green = clamp<uint32_t>((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256;
|
|
uint32_t blue = clamp<uint32_t>((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
|
|
|
|
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
|
|
source += 4;
|
|
dest += pitch;
|
|
} while (--count);
|
|
}
|
|
|
|
// Subtracts all four spans from the screen starting at sx with clamping.
|
|
void rt_revsubclamp4cols_RGBA_c (int sx, int yl, int yh)
|
|
{
|
|
uint32_t *source;
|
|
uint32_t *dest;
|
|
int count;
|
|
int pitch;
|
|
|
|
count = yh-yl;
|
|
if (count < 0)
|
|
return;
|
|
count++;
|
|
|
|
dest = ylookup[yl] + sx + (uint32_t*)dc_destorg;
|
|
source = &dc_temp_rgba[yl*4];
|
|
pitch = dc_pitch;
|
|
|
|
uint32_t light = calc_light_multiplier(dc_light);
|
|
ShadeConstants shade_constants = dc_shade_constants;
|
|
|
|
uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8);
|
|
uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8);
|
|
|
|
do {
|
|
for (int i = 0; i < 4; i++)
|
|
{
|
|
uint32_t fg = shade_pal_index(source[i], light, shade_constants);
|
|
uint32_t fg_red = (fg >> 16) & 0xff;
|
|
uint32_t fg_green = (fg >> 8) & 0xff;
|
|
uint32_t fg_blue = fg & 0xff;
|
|
|
|
uint32_t bg_red = (dest[i] >> 16) & 0xff;
|
|
uint32_t bg_green = (dest[i] >> 8) & 0xff;
|
|
uint32_t bg_blue = (dest[i]) & 0xff;
|
|
|
|
uint32_t red = clamp<uint32_t>((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256;
|
|
uint32_t green = clamp<uint32_t>((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256;
|
|
uint32_t blue = clamp<uint32_t>((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
|
|
|
|
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
|
|
}
|
|
|
|
source += 4;
|
|
dest += pitch;
|
|
} while (--count);
|
|
}
|
|
|
|
// Translates and subtracts one span at hx from the screen at sx with clamping.
|
|
void rt_tlaterevsubclamp1col_RGBA_c (int hx, int sx, int yl, int yh)
|
|
{
|
|
rt_Translate1col_RGBA_c(dc_translation, hx, yl, yh);
|
|
rt_revsubclamp1col_RGBA_c(hx, sx, yl, yh);
|
|
}
|
|
|
|
// Translates and subtracts all four spans from the screen starting at sx with clamping.
|
|
void rt_tlaterevsubclamp4cols_RGBA_c (int sx, int yl, int yh)
|
|
{
|
|
rt_Translate4cols_RGBA_c(dc_translation, yl, yh);
|
|
rt_revsubclamp4cols_RGBA_c(sx, yl, yh);
|
|
}
|
|
|
|
// Before each pass through a rendering loop that uses these routines,
|
|
// call this function to set up the span pointers.
|
|
void rt_initcols_rgba (BYTE *buff)
|
|
{
|
|
int y;
|
|
|
|
dc_temp_rgba = buff == NULL ? dc_temp_rgbabuff_rgba : (uint32_t*)buff;
|
|
for (y = 3; y >= 0; y--)
|
|
horizspan[y] = dc_ctspan[y] = &dc_tspans[y][0];
|
|
}
|
|
|
|
// Stretches a column into a temporary buffer which is later
|
|
// drawn to the screen along with up to three other columns.
|
|
void R_DrawColumnHorizP_RGBA_C (void)
|
|
{
|
|
int count = dc_count;
|
|
uint32_t *dest;
|
|
fixed_t fracstep;
|
|
fixed_t frac;
|
|
|
|
if (count <= 0)
|
|
return;
|
|
|
|
{
|
|
int x = dc_x & 3;
|
|
unsigned int **span;
|
|
|
|
span = &dc_ctspan[x];
|
|
(*span)[0] = dc_yl;
|
|
(*span)[1] = dc_yh;
|
|
*span += 2;
|
|
dest = &dc_temp_rgba[x + 4*dc_yl];
|
|
}
|
|
fracstep = dc_iscale;
|
|
frac = dc_texturefrac;
|
|
|
|
{
|
|
const BYTE *source = dc_source;
|
|
|
|
if (count & 1) {
|
|
*dest = source[frac>>FRACBITS]; dest += 4; frac += fracstep;
|
|
}
|
|
if (count & 2) {
|
|
dest[0] = source[frac>>FRACBITS]; frac += fracstep;
|
|
dest[4] = source[frac>>FRACBITS]; frac += fracstep;
|
|
dest += 8;
|
|
}
|
|
if (count & 4) {
|
|
dest[0] = source[frac>>FRACBITS]; frac += fracstep;
|
|
dest[4] = source[frac>>FRACBITS]; frac += fracstep;
|
|
dest[8] = source[frac>>FRACBITS]; frac += fracstep;
|
|
dest[12]= source[frac>>FRACBITS]; frac += fracstep;
|
|
dest += 16;
|
|
}
|
|
count >>= 3;
|
|
if (!count) return;
|
|
|
|
do
|
|
{
|
|
dest[0] = source[frac>>FRACBITS]; frac += fracstep;
|
|
dest[4] = source[frac>>FRACBITS]; frac += fracstep;
|
|
dest[8] = source[frac>>FRACBITS]; frac += fracstep;
|
|
dest[12]= source[frac>>FRACBITS]; frac += fracstep;
|
|
dest[16]= source[frac>>FRACBITS]; frac += fracstep;
|
|
dest[20]= source[frac>>FRACBITS]; frac += fracstep;
|
|
dest[24]= source[frac>>FRACBITS]; frac += fracstep;
|
|
dest[28]= source[frac>>FRACBITS]; frac += fracstep;
|
|
dest += 32;
|
|
} while (--count);
|
|
}
|
|
}
|
|
|
|
// [RH] Just fills a column with a given color
|
|
void R_FillColumnHorizP_RGBA_C (void)
|
|
{
|
|
int count = dc_count;
|
|
BYTE color = dc_color;
|
|
uint32_t *dest;
|
|
|
|
if (count <= 0)
|
|
return;
|
|
|
|
{
|
|
int x = dc_x & 3;
|
|
unsigned int **span = &dc_ctspan[x];
|
|
|
|
(*span)[0] = dc_yl;
|
|
(*span)[1] = dc_yh;
|
|
*span += 2;
|
|
dest = &dc_temp_rgba[x + 4*dc_yl];
|
|
}
|
|
|
|
if (count & 1) {
|
|
*dest = color;
|
|
dest += 4;
|
|
}
|
|
if (!(count >>= 1))
|
|
return;
|
|
do {
|
|
dest[0] = color; dest[4] = color;
|
|
dest += 8;
|
|
} while (--count);
|
|
}
|