ioq3quest/code/client/snd_altivec.c
Simon McVittie 5909b9a1cf Isolate the Altivec code so non-Altivec PPC targets can use the same binary.
Moved all the code using Altivec intrinsics to separate files. This 
means we can optionally use GCC's -maltivec on just these files, which
are chosen at runtime if the CPU supports Altivec, and compile the rest
without it, making a single binary that has Altivec optimizations but
can still work on G3.

Unlike SSE and similar extensions on x86, there does not seem to be
a way to enable conditional, targeted use of Altivec based on runtime
detection (which is what ioquake3 wants to do) without also giving the
compiler permission to use Altivec in code generation; so to not crash
on CPUs that do not implement Altivec, we'll have to turn it off
altogether, except in translation units that are only entered when
runtime Altivec detection is successful.

This has been tested on Linux PPC (on an Altivec-enabled CPU),
but we may need further work after testing trickles out to other
PowerPC devices and ancient Mac OS X builds.

I did a little work on this patch, but the majority of the effort belongs 
to Simon McVittie (thanks!).
2018-05-12 14:14:47 -04:00

229 lines
7 KiB
C

/*
===========================================================================
Copyright (C) 1999-2005 Id Software, Inc.
This file is part of Quake III Arena source code.
Quake III Arena source code is free software; you can redistribute it
and/or modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.
Quake III Arena source code is distributed in the hope that it will be
useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Quake III Arena source code; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
===========================================================================
*/
/* This file is only compiled for PowerPC builds with Altivec support.
Altivec intrinsics need to be in a separate file, so GCC's -maltivec
command line can enable them, but give us the option to _not_ use that
on other files, where the compiler might then generate Altivec
instructions for normal floating point, crashing on G3 (etc) processors. */
#include "client.h"
#include "snd_local.h"
#if idppc_altivec
#if !defined(__APPLE__)
#include <altivec.h>
#endif
void S_PaintChannelFrom16_altivec( portable_samplepair_t paintbuffer[PAINTBUFFER_SIZE], int snd_vol, channel_t *ch, const sfx_t *sc, int count, int sampleOffset, int bufferOffset ) {
int data, aoff, boff;
int leftvol, rightvol;
int i, j;
portable_samplepair_t *samp;
sndBuffer *chunk;
short *samples;
float ooff, fdata[2], fdiv, fleftvol, frightvol;
if (sc->soundChannels <= 0) {
return;
}
samp = &paintbuffer[ bufferOffset ];
if (ch->doppler) {
sampleOffset = sampleOffset*ch->oldDopplerScale;
}
if ( sc->soundChannels == 2 ) {
sampleOffset *= sc->soundChannels;
if ( sampleOffset & 1 ) {
sampleOffset &= ~1;
}
}
chunk = sc->soundData;
while (sampleOffset>=SND_CHUNK_SIZE) {
chunk = chunk->next;
sampleOffset -= SND_CHUNK_SIZE;
if (!chunk) {
chunk = sc->soundData;
}
}
if (!ch->doppler || ch->dopplerScale==1.0f) {
vector signed short volume_vec;
vector unsigned int volume_shift;
int vectorCount, samplesLeft, chunkSamplesLeft;
leftvol = ch->leftvol*snd_vol;
rightvol = ch->rightvol*snd_vol;
samples = chunk->sndChunk;
((short *)&volume_vec)[0] = leftvol;
((short *)&volume_vec)[1] = leftvol;
((short *)&volume_vec)[4] = leftvol;
((short *)&volume_vec)[5] = leftvol;
((short *)&volume_vec)[2] = rightvol;
((short *)&volume_vec)[3] = rightvol;
((short *)&volume_vec)[6] = rightvol;
((short *)&volume_vec)[7] = rightvol;
volume_shift = vec_splat_u32(8);
i = 0;
while(i < count) {
/* Try to align destination to 16-byte boundary */
while(i < count && (((unsigned long)&samp[i] & 0x1f) || ((count-i) < 8) || ((SND_CHUNK_SIZE - sampleOffset) < 8))) {
data = samples[sampleOffset++];
samp[i].left += (data * leftvol)>>8;
if ( sc->soundChannels == 2 ) {
data = samples[sampleOffset++];
}
samp[i].right += (data * rightvol)>>8;
if (sampleOffset == SND_CHUNK_SIZE) {
chunk = chunk->next;
samples = chunk->sndChunk;
sampleOffset = 0;
}
i++;
}
/* Destination is now aligned. Process as many 8-sample
chunks as we can before we run out of room from the current
sound chunk. We do 8 per loop to avoid extra source data reads. */
samplesLeft = count - i;
chunkSamplesLeft = SND_CHUNK_SIZE - sampleOffset;
if(samplesLeft > chunkSamplesLeft)
samplesLeft = chunkSamplesLeft;
vectorCount = samplesLeft / 8;
if(vectorCount)
{
vector unsigned char tmp;
vector short s0, s1, sampleData0, sampleData1;
vector signed int merge0, merge1;
vector signed int d0, d1, d2, d3;
vector unsigned char samplePermute0 =
VECCONST_UINT8(0, 1, 4, 5, 0, 1, 4, 5, 2, 3, 6, 7, 2, 3, 6, 7);
vector unsigned char samplePermute1 =
VECCONST_UINT8(8, 9, 12, 13, 8, 9, 12, 13, 10, 11, 14, 15, 10, 11, 14, 15);
vector unsigned char loadPermute0, loadPermute1;
// Rather than permute the vectors after we load them to do the sample
// replication and rearrangement, we permute the alignment vector so
// we do everything in one step below and avoid data shuffling.
tmp = vec_lvsl(0,&samples[sampleOffset]);
loadPermute0 = vec_perm(tmp,tmp,samplePermute0);
loadPermute1 = vec_perm(tmp,tmp,samplePermute1);
s0 = *(vector short *)&samples[sampleOffset];
while(vectorCount)
{
/* Load up source (16-bit) sample data */
s1 = *(vector short *)&samples[sampleOffset+7];
/* Load up destination sample data */
d0 = *(vector signed int *)&samp[i];
d1 = *(vector signed int *)&samp[i+2];
d2 = *(vector signed int *)&samp[i+4];
d3 = *(vector signed int *)&samp[i+6];
sampleData0 = vec_perm(s0,s1,loadPermute0);
sampleData1 = vec_perm(s0,s1,loadPermute1);
merge0 = vec_mule(sampleData0,volume_vec);
merge0 = vec_sra(merge0,volume_shift); /* Shift down to proper range */
merge1 = vec_mulo(sampleData0,volume_vec);
merge1 = vec_sra(merge1,volume_shift);
d0 = vec_add(merge0,d0);
d1 = vec_add(merge1,d1);
merge0 = vec_mule(sampleData1,volume_vec);
merge0 = vec_sra(merge0,volume_shift); /* Shift down to proper range */
merge1 = vec_mulo(sampleData1,volume_vec);
merge1 = vec_sra(merge1,volume_shift);
d2 = vec_add(merge0,d2);
d3 = vec_add(merge1,d3);
/* Store destination sample data */
*(vector signed int *)&samp[i] = d0;
*(vector signed int *)&samp[i+2] = d1;
*(vector signed int *)&samp[i+4] = d2;
*(vector signed int *)&samp[i+6] = d3;
i += 8;
vectorCount--;
s0 = s1;
sampleOffset += 8;
}
if (sampleOffset == SND_CHUNK_SIZE) {
chunk = chunk->next;
samples = chunk->sndChunk;
sampleOffset = 0;
}
}
}
} else {
fleftvol = ch->leftvol*snd_vol;
frightvol = ch->rightvol*snd_vol;
ooff = sampleOffset;
samples = chunk->sndChunk;
for ( i=0 ; i<count ; i++ ) {
aoff = ooff;
ooff = ooff + ch->dopplerScale * sc->soundChannels;
boff = ooff;
fdata[0] = fdata[1] = 0;
for (j=aoff; j<boff; j += sc->soundChannels) {
if (j == SND_CHUNK_SIZE) {
chunk = chunk->next;
if (!chunk) {
chunk = sc->soundData;
}
samples = chunk->sndChunk;
ooff -= SND_CHUNK_SIZE;
}
if ( sc->soundChannels == 2 ) {
fdata[0] += samples[j&(SND_CHUNK_SIZE-1)];
fdata[1] += samples[(j+1)&(SND_CHUNK_SIZE-1)];
} else {
fdata[0] += samples[j&(SND_CHUNK_SIZE-1)];
fdata[1] += samples[j&(SND_CHUNK_SIZE-1)];
}
}
fdiv = 256 * (boff-aoff) / sc->soundChannels;
samp[i].left += (fdata[0] * fleftvol)/fdiv;
samp[i].right += (fdata[1] * frightvol)/fdiv;
}
}
}
#endif