q3rally/engine/code/client/snd_altivec.c

/*
===========================================================================
Copyright (C) 1999-2005 Id Software, Inc.

This file is part of Quake III Arena source code.

Quake III Arena source code is free software; you can redistribute it
and/or modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.

Quake III Arena source code is distributed in the hope that it will be
useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Quake III Arena source code; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
===========================================================================
*/

/* This file is only compiled for PowerPC builds with Altivec support.
   Altivec intrinsics need to be in a separate file, so GCC's -maltivec
   command line can enable them, but give us the option to _not_ use that
   on other files, where the compiler might then generate Altivec
   instructions for normal floating point, crashing on G3 (etc) processors. */

#include "client.h"
#include "snd_local.h"

#if idppc_altivec

#if !defined(__APPLE__)
#include <altivec.h>
#endif

/*
 * S_PaintChannelFrom16_altivec
 *
 * Mixes `count` output frames of the 16-bit sound `sc`, as played on channel
 * `ch`, into paintbuffer[] starting at index bufferOffset.  Samples are
 * accumulated (+=) into the existing paintbuffer contents.
 *
 *   paintbuffer   destination mix buffer (left/right pairs)
 *   snd_vol       global volume multiplier applied to ch->leftvol/rightvol
 *   ch            channel supplying per-side volume and doppler settings
 *   sc            sound effect supplying the chunk list and channel count
 *   count         number of output frames to mix
 *   sampleOffset  starting frame within the sound (rescaled below for
 *                 doppler and for 2-channel data)
 *   bufferOffset  first paintbuffer index to write
 *
 * Nothing is mixed when sc->soundChannels <= 0.
 *
 * When doppler is off (or the scale is exactly 1.0) the integer path is used:
 * a scalar loop handles unaligned/short runs and an Altivec loop handles
 * aligned runs of 8 frames.  The Altivec loop replicates every source sample
 * into both the left and right output, i.e. it is only correct for mono data,
 * so 2-channel sounds are kept on the scalar loop.  Otherwise the floating
 * point doppler path averages the source samples spanned by each output frame.
 */
void S_PaintChannelFrom16_altivec( portable_samplepair_t paintbuffer[PAINTBUFFER_SIZE], int snd_vol, channel_t *ch, const sfx_t *sc, int count, int sampleOffset, int bufferOffset ) {
	int						data, aoff, boff;
	int						leftvol, rightvol;
	int						i, j;
	portable_samplepair_t	*samp;
	sndBuffer				*chunk;
	short					*samples;
	float					ooff, fdata[2], fdiv, fleftvol, frightvol;

	if (sc->soundChannels <= 0) {
		return;
	}

	samp = &paintbuffer[ bufferOffset ];

	if (ch->doppler) {
		sampleOffset = sampleOffset*ch->oldDopplerScale;
	}

	/* 2-channel data is interleaved: convert the frame offset into a
	   sample offset and keep it on a left-sample (even) boundary. */
	if ( sc->soundChannels == 2 ) {
		sampleOffset *= sc->soundChannels;

		if ( sampleOffset & 1 ) {
			sampleOffset &= ~1;
		}
	}

	/* Seek to the chunk containing sampleOffset, wrapping to the first
	   chunk when the end of the list is reached. */
	chunk = sc->soundData;
	while (sampleOffset>=SND_CHUNK_SIZE) {
		chunk = chunk->next;
		sampleOffset -= SND_CHUNK_SIZE;
		if (!chunk) {
			chunk = sc->soundData;
		}
	}

	if (!ch->doppler || ch->dopplerScale==1.0f) {
		vector signed short volume_vec;
		vector unsigned int volume_shift;
		int vectorCount, samplesLeft, chunkSamplesLeft;
		/* The vector loop below treats the source as mono, so interleaved
		   2-channel sounds must stay on the scalar loop. */
		const int scalarOnly = ( sc->soundChannels == 2 );

		leftvol = ch->leftvol*snd_vol;
		rightvol = ch->rightvol*snd_vol;
		samples = chunk->sndChunk;
		/* volume_vec = { L, L, R, R, L, L, R, R } so that the even/odd
		   multiplies below each yield left,right,left,right products. */
		((short *)&volume_vec)[0] = leftvol;
		((short *)&volume_vec)[1] = leftvol;
		((short *)&volume_vec)[4] = leftvol;
		((short *)&volume_vec)[5] = leftvol;
		((short *)&volume_vec)[2] = rightvol;
		((short *)&volume_vec)[3] = rightvol;
		((short *)&volume_vec)[6] = rightvol;
		((short *)&volume_vec)[7] = rightvol;
		volume_shift = vec_splat_u32(8);
		i = 0;

		while(i < count) {
			/* Scalar loop: runs until the destination is aligned and at least
			   8 frames remain in both the request and the current chunk.
			   Note the mask 0x1f tests 32-byte alignment, which is stricter
			   than the 16 bytes the vector accesses need. */
			while(i < count && (scalarOnly || ((unsigned long)&samp[i] & 0x1f) || ((count-i) < 8) || ((SND_CHUNK_SIZE - sampleOffset) < 8))) {
				data  = samples[sampleOffset++];
				samp[i].left += (data * leftvol)>>8;

				/* 2-channel: the right sample follows the left one;
				   otherwise the same sample feeds both sides. */
				if ( sc->soundChannels == 2 ) {
					data = samples[sampleOffset++];
				}
				samp[i].right += (data * rightvol)>>8;
	
				if (sampleOffset == SND_CHUNK_SIZE) {
					chunk = chunk->next;
					/* Wrap like the seek loop above instead of forming a
					   pointer from a NULL chunk at the end of the list. */
					if (!chunk) {
						chunk = sc->soundData;
					}
					samples = chunk->sndChunk;
					sampleOffset = 0;
				}
				i++;
			}
			/* Destination is now aligned.  Process as many 8-sample 
			   chunks as we can before we run out of room from the current
			   sound chunk.  We do 8 per loop to avoid extra source data reads. */
			samplesLeft = count - i;
			chunkSamplesLeft = SND_CHUNK_SIZE - sampleOffset;
			if(samplesLeft > chunkSamplesLeft)
				samplesLeft = chunkSamplesLeft;
			
			vectorCount = samplesLeft / 8;
			
			if(vectorCount)
			{
				vector unsigned char tmp;
				vector short s0, s1, sampleData0, sampleData1;
				vector signed int merge0, merge1;
				vector signed int d0, d1, d2, d3;				
				/* Byte selectors: samplePermute0 arranges source samples
				   0,2,0,2,1,3,1,3 and samplePermute1 does the same for
				   samples 4..7 (4,6,4,6,5,7,5,7). */
				vector unsigned char samplePermute0 =
					VECCONST_UINT8(0, 1, 4, 5, 0, 1, 4, 5, 2, 3, 6, 7, 2, 3, 6, 7);
				vector unsigned char samplePermute1 = 
					VECCONST_UINT8(8, 9, 12, 13, 8, 9, 12, 13, 10, 11, 14, 15, 10, 11, 14, 15);
				vector unsigned char loadPermute0, loadPermute1;
				
				// Rather than permute the vectors after we load them to do the sample
				// replication and rearrangement, we permute the alignment vector so
				// we do everything in one step below and avoid data shuffling.
				tmp = vec_lvsl(0,&samples[sampleOffset]);								
				loadPermute0 = vec_perm(tmp,tmp,samplePermute0);
				loadPermute1 = vec_perm(tmp,tmp,samplePermute1);
				
				s0 = *(vector short *)&samples[sampleOffset];
				while(vectorCount)
				{
					/* Load up source (16-bit) sample data */
					s1 = *(vector short *)&samples[sampleOffset+7];
					
					/* Load up destination sample data */
					d0 = *(vector signed int *)&samp[i];
					d1 = *(vector signed int *)&samp[i+2];
					d2 = *(vector signed int *)&samp[i+4];
					d3 = *(vector signed int *)&samp[i+6];

					sampleData0 = vec_perm(s0,s1,loadPermute0);
					sampleData1 = vec_perm(s0,s1,loadPermute1);
					
					/* Even elements -> frames i, i+1 */
					merge0 = vec_mule(sampleData0,volume_vec);
					merge0 = vec_sra(merge0,volume_shift);	/* Shift down to proper range */
					
					/* Odd elements -> frames i+2, i+3 */
					merge1 = vec_mulo(sampleData0,volume_vec);
					merge1 = vec_sra(merge1,volume_shift);
					
					d0 = vec_add(merge0,d0);
					d1 = vec_add(merge1,d1);
					
					/* Same again for frames i+4 .. i+7 */
					merge0 = vec_mule(sampleData1,volume_vec);
					merge0 = vec_sra(merge0,volume_shift);	/* Shift down to proper range */
					
					merge1 = vec_mulo(sampleData1,volume_vec);
					merge1 = vec_sra(merge1,volume_shift);					

					d2 = vec_add(merge0,d2);
					d3 = vec_add(merge1,d3);

					/* Store destination sample data */
					*(vector signed int *)&samp[i] = d0;
					*(vector signed int *)&samp[i+2] = d1;
					*(vector signed int *)&samp[i+4] = d2;
					*(vector signed int *)&samp[i+6] = d3;

					i += 8;
					vectorCount--;
					s0 = s1;	/* the block just loaded is the next iteration's first half */
					sampleOffset += 8;
				}
				if (sampleOffset == SND_CHUNK_SIZE) {
					chunk = chunk->next;
					/* Same end-of-list wrap as the scalar loop. */
					if (!chunk) {
						chunk = sc->soundData;
					}
					samples = chunk->sndChunk;
					sampleOffset = 0;
				}
			}
		}
	} else {
		/* Doppler path: each output frame is the average of the source
		   samples between the old and new fractional read positions. */
		fleftvol = ch->leftvol*snd_vol;
		frightvol = ch->rightvol*snd_vol;

		ooff = sampleOffset;
		samples = chunk->sndChunk;
		
		for ( i=0 ; i<count ; i++ ) {

			aoff = ooff;
			ooff = ooff + ch->dopplerScale * sc->soundChannels;
			boff = ooff;
			fdata[0] = fdata[1] = 0;
			for (j=aoff; j<boff; j += sc->soundChannels) {
				if (j == SND_CHUNK_SIZE) {
					chunk = chunk->next;
					if (!chunk) {
						chunk = sc->soundData;
					}
					samples = chunk->sndChunk;
					ooff -= SND_CHUNK_SIZE;
				}
				/* j keeps counting past the chunk end; the mask folds it
				   back into the (new) chunk's index range. */
				if ( sc->soundChannels == 2 ) {
					fdata[0] += samples[j&(SND_CHUNK_SIZE-1)];
					fdata[1] += samples[(j+1)&(SND_CHUNK_SIZE-1)];
				} else {
					fdata[0] += samples[j&(SND_CHUNK_SIZE-1)];
					fdata[1] += samples[j&(SND_CHUNK_SIZE-1)];
				}
			}
			/* 256 matches the >>8 of the integer path; the remainder is the
			   number of frames summed above.
			   NOTE(review): fdiv is 0 when boff == aoff; presumably callers
			   only enable doppler with dopplerScale > 1 - confirm. */
			fdiv = 256 * (boff-aoff) / sc->soundChannels;
			samp[i].left += (fdata[0] * fleftvol)/fdiv;
			samp[i].right += (fdata[1] * frightvol)/fdiv;
		}
	}
}


#endif
ioquake3 resync to revision 3511 from 3444. This updates from SDL 2.0.4 to SDL 2.0.8. Fix nullptr dereference in front of nullptr check in FS_CheckPak0 Fix undefined behaviour due to shifting signed in snd_mem.c Fix shifting bits out of byte in tr_font.c Fix shift into sign in cl_cin.c Fix signed bit operations in MSG_ReadBits Add missing address operator in cm_polylib.c OpenGL1: Decay float[8] to float * in tr_marks.c Avoid srcList[-1] in snd_openal.c Fix the behaviour of CVAR_LATCH\|CVAR_CHEAT cvars Maximize cURL buffer size Fix mouse grab after toggling fullscreen Fix q3history buffer not cleared between mods and OOB-access Revert "Removed "Color Depth" from q3_ui system settings, it didn't control anything." Fix displayed color/depth/stencil bits values Restore setting r_colorbits in q3_ui Make setting r_stencilbits more consistent in Team Arena UI Fix map list in Team Arena start server menu after entering SP menu Support SDL audio devices that require float32 samples. sdl_snd.c should just initialize SDL audio without checking SDL_WasInit(). There's no need to SDL_PauseAudio(1) before calling SDL_CloseAudio(). Added audio capture support to SDL backend. Use the SDL2 audio device interface instead of the legacy 1.2 API. Disable SDL audio capture until prebuilt SDL libraries are updated to 2.0.8. 
Update SDL2 to 2.0.8 Add SDL 2.0.1 headers for macOS PPC Make macOS Universal Bundle target 10.6 for x86 and x86_64 Fix possible bot goal state NULL pointer dereference Fix uninitialized bot_goal_t fields Remove unnecessary NULL pointer check in Cmd_RemoveCommand Make UI_DrawProportionalString handle NULL string Fix compiling against macOS system OpenAL and SDL2 frameworks Fix array index in CanDamage() function - discovered by MARTY Fix compiling Makefile (broke in macOS frameworks commit) Fix clearing keys for control in Team Arena UI Make s_useOpenAL be CVAR_LATCH Improvements for dedicated camera followers (team follow1/2) Fix not closing description.txt and fix path seperator Fix duplicate bots displayed in Team Arena ingame add bot menu OpenGL2: Fix parsing specularScale in shaders Don't allow SDL audio capture using pulseaudio Isolate the Altivec code so non-Altivec PPC targets can use the same binary. Limit -maltivec to specific source files on OpenBSD too (untested) Use SDL 2.0.1 headers for macOS ppc64 Fix console offset while Team Arena voiceMenu is open OpenGL2: Readd r_deluxeSpecular. Fix client kicked as unpure when missing the latest cgame/ui pk3s Don't create multiple windows when GL context creation fails Require OpenGL 1.2 for GL_CLAMP_TO_EDGE Fix Linux uninstaller requiring Bash Fix Linux uninstaller redirecting stderr to stdout in preuninstall.sh Reported by @illwieckz. 
Fix in_restart causing fatal error while video is shutdown Allow pkg-config binary to be overridden with PKG_CONFIG Make testgun command without argument disable test gun model Remove unused renderer_buffer variable Don't upload 8 bit grayscale images as 16 bit luminance OpenGL1: Use RE_UploadCinematic() instead of duplicate code Don't load non-core GL functions for OpenGL 3.2 core context Load OpenGL ES 2.0 function procs Don't check fixed function GL extensions when using shader pipeline OpenGL2: Fix world VAO cache drawing when glIndex_t is unsigned short OpenGL2: Misc fixes and cleanup Fix IQM root joint backlerp when joint number is more than 0 Improve IQM loading Improve IQM CPU vertex skinning performance OpenGL2: Add GPU vertex skinning for IQM models 2018-07-30 11:35:12 +00:00			`/*`
			`===========================================================================`
			`Copyright (C) 1999-2005 Id Software, Inc.`

			`This file is part of Quake III Arena source code.`

			`Quake III Arena source code is free software; you can redistribute it`
			`and/or modify it under the terms of the GNU General Public License as`
			`published by the Free Software Foundation; either version 2 of the License,`
			`or (at your option) any later version.`

			`Quake III Arena source code is distributed in the hope that it will be`
			`useful, but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with Quake III Arena source code; if not, write to the Free Software`
			`Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA`
			`===========================================================================`
			`*/`

			`/* This file is only compiled for PowerPC builds with Altivec support.`
			`Altivec intrinsics need to be in a separate file, so GCC's -maltivec`
			`command line can enable them, but give us the option to _not_ use that`
			`on other files, where the compiler might then generate Altivec`
			`instructions for normal floating point, crashing on G3 (etc) processors. */`

			`#include "client.h"`
			`#include "snd_local.h"`

			`#if idppc_altivec`

			`#if !defined(__APPLE__)`
			`#include <altivec.h>`
			`#endif`

			`void S_PaintChannelFrom16_altivec( portable_samplepair_t paintbuffer[PAINTBUFFER_SIZE], int snd_vol, channel_t *ch, const sfx_t *sc, int count, int sampleOffset, int bufferOffset ) {`
			`int data, aoff, boff;`
			`int leftvol, rightvol;`
			`int i, j;`
			`portable_samplepair_t *samp;`
			`sndBuffer *chunk;`
			`short *samples;`
			`float ooff, fdata[2], fdiv, fleftvol, frightvol;`

			`if (sc->soundChannels <= 0) {`
			`return;`
			`}`

			`samp = &paintbuffer[ bufferOffset ];`

			`if (ch->doppler) {`
			`sampleOffset = sampleOffset*ch->oldDopplerScale;`
			`}`

			`if ( sc->soundChannels == 2 ) {`
			`sampleOffset *= sc->soundChannels;`

			`if ( sampleOffset & 1 ) {`
			`sampleOffset &= ~1;`
			`}`
			`}`

			`chunk = sc->soundData;`
			`while (sampleOffset>=SND_CHUNK_SIZE) {`
			`chunk = chunk->next;`
			`sampleOffset -= SND_CHUNK_SIZE;`
			`if (!chunk) {`
			`chunk = sc->soundData;`
			`}`
			`}`

			`if (!ch->doppler || ch->dopplerScale==1.0f) {`
			`vector signed short volume_vec;`
			`vector unsigned int volume_shift;`
			`int vectorCount, samplesLeft, chunkSamplesLeft;`
			`leftvol = ch->leftvol*snd_vol;`
			`rightvol = ch->rightvol*snd_vol;`
			`samples = chunk->sndChunk;`
			`((short *)&volume_vec)[0] = leftvol;`
			`((short *)&volume_vec)[1] = leftvol;`
			`((short *)&volume_vec)[4] = leftvol;`
			`((short *)&volume_vec)[5] = leftvol;`
			`((short *)&volume_vec)[2] = rightvol;`
			`((short *)&volume_vec)[3] = rightvol;`
			`((short *)&volume_vec)[6] = rightvol;`
			`((short *)&volume_vec)[7] = rightvol;`
			`volume_shift = vec_splat_u32(8);`
			`i = 0;`

			`while(i < count) {`
			`/* Try to align destination to 16-byte boundary */`
			`while(i < count && (((unsigned long)&samp[i] & 0x1f) || ((count-i) < 8) || ((SND_CHUNK_SIZE - sampleOffset) < 8))) {`
			`data = samples[sampleOffset++];`
			`samp[i].left += (data * leftvol)>>8;`

			`if ( sc->soundChannels == 2 ) {`
			`data = samples[sampleOffset++];`
			`}`
			`samp[i].right += (data * rightvol)>>8;`

			`if (sampleOffset == SND_CHUNK_SIZE) {`
			`chunk = chunk->next;`
			`samples = chunk->sndChunk;`
			`sampleOffset = 0;`
			`}`
			`i++;`
			`}`
			`/* Destination is now aligned. Process as many 8-sample`
			`chunks as we can before we run out of room from the current`
			`sound chunk. We do 8 per loop to avoid extra source data reads. */`
			`samplesLeft = count - i;`
			`chunkSamplesLeft = SND_CHUNK_SIZE - sampleOffset;`
			`if(samplesLeft > chunkSamplesLeft)`
			`samplesLeft = chunkSamplesLeft;`

			`vectorCount = samplesLeft / 8;`

			`if(vectorCount)`
			`{`
			`vector unsigned char tmp;`
			`vector short s0, s1, sampleData0, sampleData1;`
			`vector signed int merge0, merge1;`
			`vector signed int d0, d1, d2, d3;`
			`vector unsigned char samplePermute0 =`
			`VECCONST_UINT8(0, 1, 4, 5, 0, 1, 4, 5, 2, 3, 6, 7, 2, 3, 6, 7);`
			`vector unsigned char samplePermute1 =`
			`VECCONST_UINT8(8, 9, 12, 13, 8, 9, 12, 13, 10, 11, 14, 15, 10, 11, 14, 15);`
			`vector unsigned char loadPermute0, loadPermute1;`

			`// Rather than permute the vectors after we load them to do the sample`
			`// replication and rearrangement, we permute the alignment vector so`
			`// we do everything in one step below and avoid data shuffling.`
			`tmp = vec_lvsl(0,&samples[sampleOffset]);`
			`loadPermute0 = vec_perm(tmp,tmp,samplePermute0);`
			`loadPermute1 = vec_perm(tmp,tmp,samplePermute1);`

			`s0 = *(vector short *)&samples[sampleOffset];`
			`while(vectorCount)`
			`{`
			`/* Load up source (16-bit) sample data */`
			`s1 = *(vector short *)&samples[sampleOffset+7];`

			`/* Load up destination sample data */`
			`d0 = *(vector signed int *)&samp[i];`
			`d1 = *(vector signed int *)&samp[i+2];`
			`d2 = *(vector signed int *)&samp[i+4];`
			`d3 = *(vector signed int *)&samp[i+6];`

			`sampleData0 = vec_perm(s0,s1,loadPermute0);`
			`sampleData1 = vec_perm(s0,s1,loadPermute1);`

			`merge0 = vec_mule(sampleData0,volume_vec);`
			`merge0 = vec_sra(merge0,volume_shift); /* Shift down to proper range */`

			`merge1 = vec_mulo(sampleData0,volume_vec);`
			`merge1 = vec_sra(merge1,volume_shift);`

			`d0 = vec_add(merge0,d0);`
			`d1 = vec_add(merge1,d1);`

			`merge0 = vec_mule(sampleData1,volume_vec);`
			`merge0 = vec_sra(merge0,volume_shift); /* Shift down to proper range */`

			`merge1 = vec_mulo(sampleData1,volume_vec);`
			`merge1 = vec_sra(merge1,volume_shift);`

			`d2 = vec_add(merge0,d2);`
			`d3 = vec_add(merge1,d3);`

			`/* Store destination sample data */`
			`*(vector signed int *)&samp[i] = d0;`
			`*(vector signed int *)&samp[i+2] = d1;`
			`*(vector signed int *)&samp[i+4] = d2;`
			`*(vector signed int *)&samp[i+6] = d3;`

			`i += 8;`
			`vectorCount--;`
			`s0 = s1;`
			`sampleOffset += 8;`
			`}`
			`if (sampleOffset == SND_CHUNK_SIZE) {`
			`chunk = chunk->next;`
			`samples = chunk->sndChunk;`
			`sampleOffset = 0;`
			`}`
			`}`
			`}`
			`} else {`
			`fleftvol = ch->leftvol*snd_vol;`
			`frightvol = ch->rightvol*snd_vol;`

			`ooff = sampleOffset;`
			`samples = chunk->sndChunk;`

			`for ( i=0 ; i<count ; i++ ) {`

			`aoff = ooff;`
			`ooff = ooff + ch->dopplerScale * sc->soundChannels;`
			`boff = ooff;`
			`fdata[0] = fdata[1] = 0;`
			`for (j=aoff; j<boff; j += sc->soundChannels) {`
			`if (j == SND_CHUNK_SIZE) {`
			`chunk = chunk->next;`
			`if (!chunk) {`
			`chunk = sc->soundData;`
			`}`
			`samples = chunk->sndChunk;`
			`ooff -= SND_CHUNK_SIZE;`
			`}`
			`if ( sc->soundChannels == 2 ) {`
			`fdata[0] += samples[j&(SND_CHUNK_SIZE-1)];`
			`fdata[1] += samples[(j+1)&(SND_CHUNK_SIZE-1)];`
			`} else {`
			`fdata[0] += samples[j&(SND_CHUNK_SIZE-1)];`
			`fdata[1] += samples[j&(SND_CHUNK_SIZE-1)];`
			`}`
			`}`
			`fdiv = 256 * (boff-aoff) / sc->soundChannels;`
			`samp[i].left += (fdata[0] * fleftvol)/fdiv;`
			`samp[i].right += (fdata[1] * frightvol)/fdiv;`
			`}`
			`}`
			`}`


			`#endif`