Use VFPU for most memcpy operations - backported from ADQ

This commit is contained in:
moto 2022-05-19 12:58:48 -04:00
parent 431f5bd09f
commit a2ce78450d
21 changed files with 387 additions and 63 deletions

View file

@ -337,12 +337,12 @@ void SetPal (int i)
void CL_CopyPlayerInfo (entity_t *ent, entity_t *player)
{
memcpy (&ent->baseline, &player->baseline, sizeof(entity_state_t));
memcpy_vfpu(&ent->baseline, &player->baseline, sizeof(entity_state_t));
ent->msgtime = player->msgtime;
memcpy (ent->msg_origins, player->msg_origins, sizeof(ent->msg_origins));
memcpy_vfpu(ent->msg_origins, player->msg_origins, sizeof(ent->msg_origins));
VectorCopy (player->origin, ent->origin);
memcpy (ent->msg_angles, player->msg_angles, sizeof(ent->msg_angles));
memcpy_vfpu(ent->msg_angles, player->msg_angles, sizeof(ent->msg_angles));
VectorCopy (player->angles, ent->angles);
ent->model = (ent == &q3player_body.ent) ? cl.model_precache[cl_modelindex[mi_q3torso]] : cl.model_precache[cl_modelindex[mi_q3head]];

View file

@ -205,7 +205,7 @@ void CL_KeepaliveMessage (void)
// read messages from server, should just be nops
old = net_message;
memcpy (olddata, net_message.data, net_message.cursize);
memcpy_vfpu(olddata, net_message.data, net_message.cursize);
do
{
@ -227,7 +227,7 @@ void CL_KeepaliveMessage (void)
} while (ret);
net_message = old;
memcpy (net_message.data, olddata, net_message.cursize);
memcpy_vfpu(net_message.data, olddata, net_message.cursize);
// check time
time = Sys_FloatTime ();

View file

@ -124,9 +124,9 @@ void SList_Switch (int a, int b)
if (b >= MAX_SERVER_LIST || b < 0)
Sys_Error ("SList_Switch: Bad index %d", b);
memcpy (&temp, &slist[a], sizeof(temp));
memcpy (&slist[a], &slist[b], sizeof(temp));
memcpy (&slist[b], &temp, sizeof(temp));
memcpy_vfpu(&temp, &slist[a], sizeof(temp));
memcpy_vfpu(&slist[a], &slist[b], sizeof(temp));
memcpy_vfpu(&slist[b], &temp, sizeof(temp));
}
int SList_Length (void)

View file

@ -164,7 +164,7 @@ void Cbuf_Execute (void)
}
memcpy (line, text, i);
memcpy_vfpu(line, text, i);
line[i] = 0;
// delete the text from the command buffer and move remaining commands down

View file

@ -406,9 +406,9 @@ void Host_SavegameComment (char *text)
for (i=0 ; i<SAVEGAME_COMMENT_LENGTH ; i++)
text[i] = ' ';
memcpy (text, cl.levelname, strlen(cl.levelname));
memcpy_vfpu(text, cl.levelname, strlen(cl.levelname));
sprintf (kills,"kills:%3i/%3i", cl.stats[STAT_INSTA], cl.stats[STAT_ROUNDCHANGE]);
memcpy (text+22, kills, strlen(kills));
memcpy_vfpu(text+22, kills, strlen(kills));
// convert space to _ to make stdio happy
for (i=0 ; i<SAVEGAME_COMMENT_LENGTH ; i++)
if (text[i] == ' ')

View file

@ -183,7 +183,7 @@ void RotatePointAroundVector( vec3_t dst, const vec3_t dir, const vec3_t point,
m[1][2] = vf[1];
m[2][2] = vf[2];
memcpy( im, m, sizeof( im ) );
memcpy_vfpu( im, m, sizeof( im ) );
im[0][1] = m[1][0];
im[0][2] = m[2][0];

View file

@ -286,16 +286,16 @@ void M_BuildTranslationTable(int top, int bottom)
identityTable[j] = j;
dest = translationTable;
source = identityTable;
memcpy (dest, source, 256);
memcpy_vfpu(dest, source, 256);
if (top < 128) // the artists made some backwards ranges. sigh.
memcpy (dest + TOP_RANGE, source + top, 16);
memcpy_vfpu(dest + TOP_RANGE, source + top, 16);
else
for (j=0 ; j<16 ; j++)
dest[TOP_RANGE+j] = source[top+15-j];
if (bottom < 128)
memcpy (dest + BOTTOM_RANGE, source + bottom, 16);
memcpy_vfpu(dest + BOTTOM_RANGE, source + bottom, 16);
else
for (j=0 ; j<16 ; j++)
dest[BOTTOM_RANGE+j] = source[bottom+15-j];

View file

@ -940,7 +940,7 @@ int PF_newcheckclient (int check)
VectorAdd (ent->v.origin, ent->v.view_ofs, org);
leaf = Mod_PointInLeaf (org, sv.worldmodel);
pvs = Mod_LeafPVS (leaf, sv.worldmodel);
memcpy (checkpvs, pvs, (sv.worldmodel->numleafs+7)>>3 );
memcpy_vfpu(checkpvs, pvs, (sv.worldmodel->numleafs+7)>>3 );
return i;
}

View file

@ -94,9 +94,9 @@ void IN_Init (void)
buttonToGameKeyMap[buttonMaskToShift(PSP_CTRL_RIGHT)] = K_RIGHTARROW;
buttonToGameKeyMap[buttonMaskToShift(PSP_CTRL_DOWN)] = K_DOWNARROW;
buttonToGameKeyMap[buttonMaskToShift(PSP_CTRL_LEFT)] = K_LEFTARROW;
memcpy(buttonToConsoleKeyMap, buttonToGameKeyMap, sizeof(ButtonToKeyMap));
memcpy(buttonToMessageKeyMap, buttonToGameKeyMap, sizeof(ButtonToKeyMap));
memcpy(buttonToMenuKeyMap, buttonToGameKeyMap, sizeof(ButtonToKeyMap));
memcpy_vfpu(buttonToConsoleKeyMap, buttonToGameKeyMap, sizeof(ButtonToKeyMap));
memcpy_vfpu(buttonToMessageKeyMap, buttonToGameKeyMap, sizeof(ButtonToKeyMap));
memcpy_vfpu(buttonToMenuKeyMap, buttonToGameKeyMap, sizeof(ButtonToKeyMap));
// Game keys:
buttonToGameKeyMap[buttonMaskToShift(PSP_CTRL_LTRIGGER)] = K_AUX1;
@ -115,7 +115,7 @@ void IN_Init (void)
buttonToConsoleKeyMap[buttonMaskToShift(PSP_CTRL_SQUARE)] = K_INS;
// Message keys:
memcpy(buttonToMessageKeyMap, buttonToConsoleKeyMap, sizeof(ButtonToKeyMap));
memcpy_vfpu(buttonToMessageKeyMap, buttonToConsoleKeyMap, sizeof(ButtonToKeyMap));
// Menu keys:
buttonToMenuKeyMap[buttonMaskToShift(PSP_CTRL_SQUARE)] = K_INS;

View file

@ -77,7 +77,7 @@ static pdpStatStruct gPdpStat;
pdpStatStruct *findPdpStat(int socket, pdpStatStruct *pdpStat)
{
if(socket == pdpStat->pdpId) {
memcpy(&gPdpStat, pdpStat, sizeof(pdpStatStruct));
memcpy_vfpu(&gPdpStat, pdpStat, sizeof(pdpStatStruct));
return &gPdpStat;
}
if(pdpStat->next) return findPdpStat(socket, pdpStat->next);
@ -863,7 +863,7 @@ namespace quake
pdpStatStruct *tempPdp = findPdpStat(socket, pdpStat);
if(tempPdp < 0) return -1;
memcpy(((struct sockaddr_adhoc *)addr)->mac, tempPdp->mac, 6);
memcpy_vfpu(((struct sockaddr_adhoc *)addr)->mac, tempPdp->mac, 6);
((struct sockaddr_adhoc *)addr)->port = tempPdp->port;
addr->sa_family = ADHOC_NET;
return 0;

View file

@ -488,7 +488,7 @@ qpic_t *Draw_CacheImg (char *path)
// the translatable player picture just for the menu
// configuration dialog
if (!strcmp (path, "gfx/menuplyr.lmp"))
memcpy (menuplyr_pixels, dat->data, dat->width*dat->height);
memcpy(menuplyr_pixels, dat->data, dat->width*dat->height);
pic->pic.width = dat->width;
pic->pic.height = dat->height;
@ -3398,16 +3398,6 @@ void GL_Upload4(int texture_index, const byte *data, int width, int height)
memcpy(texture.palette, data + buffer_size, 16 * 4);
int i;
// Copy to VRAM?
/*if (texture.vram)
{
// Copy.
memcpy(texture.vram, texture.ram, buffer_size);
// Flush the data cache.
sceKernelDcacheWritebackRange(texture.vram, buffer_size);
}*/
// Flush the data cache.
sceKernelDcacheWritebackRange(texture.ram, buffer_size);
}

View file

@ -130,7 +130,7 @@ qboolean Mod_LoadHLModel (model_t *mod, void *buffer)
model = static_cast<hlmodelcache_t*>(Hunk_Alloc(sizeof(hlmodelcache_t)));
header = static_cast<hlmdl_header_t*>(Hunk_Alloc(com_filesize));
memcpy(header, buffer, com_filesize);
memcpy_vfpu(header, buffer, com_filesize);
if (header->version != 10)
{
@ -175,7 +175,7 @@ qboolean Mod_LoadHLModel (model_t *mod, void *buffer)
if (!mod->cache.data)
return qfalse;
memcpy (mod->cache.data, model, total);
memcpy_vfpu(mod->cache.data, model, total);
Hunk_FreeToLowMark (start);
return qtrue;
@ -482,7 +482,7 @@ void HL_SetupBones(hlmodel_t *model)
}
else
{
memcpy(transform_matrix[i], matrix, 12 * sizeof(float));
memcpy_vfpu(transform_matrix[i], matrix, 12 * sizeof(float));
}
}
}

View file

@ -260,7 +260,7 @@ byte* LoadPCX (FILE *f, int matchwidth, int matchheight)
image_width = pcx->xmax+1;
image_height = pcx->ymax+1;
memcpy(image_palette, palette, sizeof(palette));
memcpy_vfpu(image_palette, palette, sizeof(palette));
image_palette_type = PAL_RGB;
fclose (f);
@ -304,7 +304,7 @@ byte *LoadWAL (char *name)
size = width * height;
data = static_cast<byte*>(malloc(size));
memcpy(data, (byte *)mt + ofs, size);
memcpy_vfpu(data, (byte *)mt + ofs, size);
image_palette_type = PAL_Q2;

View file

@ -343,7 +343,7 @@ void GL_MakeAliasModelDisplayListsH2 (model_t *m, aliashdr_t *hdr)
int* cmds = static_cast<int*>(Hunk_Alloc (numcommands * 4));
paliashdr->commands = (byte *)cmds - (byte *)paliashdr;
memcpy (cmds, commands, numcommands * 4);
memcpy_vfpu(cmds, commands, numcommands * 4);
trivertx_t* verts = static_cast<trivertx_t*>(Hunk_Alloc (paliashdr->numposes * paliashdr->poseverts
* sizeof(trivertx_t) ));
@ -420,7 +420,7 @@ void GL_MakeAliasModelDisplayLists (model_t *m, aliashdr_t *hdr)
cmds = static_cast<int*>(Hunk_Alloc (numcommands * 4));
paliashdr->commands = (byte *)cmds - (byte *)paliashdr;
memcpy (cmds, commands, numcommands * 4);
memcpy_vfpu(cmds, commands, numcommands * 4);
verts = static_cast<trivertx_t*>(Hunk_Alloc (paliashdr->numposes * paliashdr->poseverts
* sizeof(trivertx_t) ));

View file

@ -537,13 +537,13 @@ void Mod_LoadTextures (lump_t *l)
loadmodel->textures[i] = tx;
memcpy (tx->name, mt->name, sizeof(tx->name));
memcpy_vfpu(tx->name, mt->name, sizeof(tx->name));
tx->width = mt->width;
tx->height = mt->height;
for (j=0 ; j<MIPLEVELS ; j++)
tx->offsets[j] = mt->offsets[j] + sizeof(texture_t) - sizeof(miptex_t);
// the pixels immediately follow the structures
memcpy ( tx_pixels, mt+1, pixels);
memcpy_vfpu( tx_pixels, mt+1, pixels);
int level = 0;
if (r_mipmaps.value > 0)
@ -776,7 +776,7 @@ void Mod_LoadLighting (lump_t *l)
return;
}
loadmodel->lightdata = static_cast<byte*>(Hunk_AllocName ( l->filelen, loadname));
memcpy (loadmodel->lightdata, mod_base + l->fileofs, l->filelen);
memcpy_vfpu(loadmodel->lightdata, mod_base + l->fileofs, l->filelen);
return;
}
@ -816,7 +816,7 @@ void Mod_LoadLighting (lump_t *l)
loadmodel->lightdata = static_cast<byte*>(Hunk_AllocName ( l->filelen*3, litfilename));
in = loadmodel->lightdata + l->filelen*2; // place the file at the end, so it will not be overwritten until the very last write
out = loadmodel->lightdata;
memcpy (in, mod_base + l->fileofs, l->filelen);
memcpy_vfpu(in, mod_base + l->fileofs, l->filelen);
for (i = 0;i < l->filelen;i++)
{
d = *in++;
@ -853,7 +853,7 @@ void Mod_HL_LoadLighting (lump_t *l)
}
loadmodel->lightdata = static_cast<byte*>(Hunk_AllocName ( l->filelen, loadname));
memcpy (loadmodel->lightdata, mod_base + l->fileofs, l->filelen);
memcpy_vfpu(loadmodel->lightdata, mod_base + l->fileofs, l->filelen);
}
/*
@ -871,7 +871,7 @@ void Mod_LoadVisibility (lump_t *l)
}
loadmodel->visdata = static_cast<byte*>(Hunk_AllocName ( l->filelen, loadname));
memcpy (loadmodel->visdata, mod_base + l->fileofs, l->filelen);
memcpy_vfpu(loadmodel->visdata, mod_base + l->fileofs, l->filelen);
}
/*
@ -980,7 +980,7 @@ void Mod_LoadEntities (lump_t *l)
}
loadmodel->entities = static_cast<char*>(Hunk_AllocName ( l->filelen, entfilename));
memcpy (loadmodel->entities, mod_base + l->fileofs, l->filelen);
memcpy_vfpu(loadmodel->entities, mod_base + l->fileofs, l->filelen);
if (loadmodel->bspversion == HL_BSPVERSION || loadmodel->bspversion == NZP_BSPVERSION)
Mod_ParseWadsFromEntityLump(loadmodel->entities);
@ -2324,7 +2324,7 @@ void Mod_LoadAliasModel (model_t *mod, void *buffer)
if (!mod->cache.data)
return;
memcpy (mod->cache.data, pheader, total);
memcpy_vfpu(mod->cache.data, pheader, total);
Hunk_FreeToLowMark (start);
}
@ -2496,7 +2496,7 @@ void Mod_LoadH2AliasModel (model_t *mod, void *buffer)
Cache_Alloc (&mod->cache, total, loadname);
if (!mod->cache.data)
return;
memcpy (mod->cache.data, pheader, total);
memcpy_vfpu(mod->cache.data, pheader, total);
Hunk_FreeToLowMark (start);
}
@ -2650,7 +2650,7 @@ void Mod_LoadQ2AliasModel (model_t *mod, void *buffer)
Cache_Alloc (&mod->cache, total, loadname);
if (!mod->cache.data)
return;
memcpy (mod->cache.data, pheader, total);
memcpy_vfpu(mod->cache.data, pheader, total);
Hunk_FreeToLowMark (start);
}
@ -3247,7 +3247,7 @@ void Mod_LoadQ3AliasModel (model_t *mod, void *buffer)
if (!mod->cache.data)
return;
memcpy (header, buffer, com_filesize);
memcpy_vfpu(header, buffer, com_filesize);
base = com_filesize;
mod->type = mod_md3;

View file

@ -74,7 +74,7 @@ static void Image_Resample32 (void *indata, int inwidth, int inheight,void *outd
{
inrow = (byte *) indata + inwidth4 * yi;
if (yi == oldy + 1)
memcpy(row1, row2, outwidth4);
memcpy_vfpu(row1, row2, outwidth4);
else
Image_Resample32LerpLine (inrow, row1, inwidth, outwidth);
Image_Resample32LerpLine (inrow + inwidth4, row2, inwidth, outwidth);
@ -116,12 +116,12 @@ static void Image_Resample32 (void *indata, int inwidth, int inheight,void *outd
{
inrow = (byte *) indata + inwidth4 * yi;
if (yi == oldy+1)
memcpy(row1, row2, outwidth4);
memcpy_vfpu(row1, row2, outwidth4);
else
Image_Resample32LerpLine (inrow, row1, inwidth, outwidth);
oldy = yi;
}
memcpy(out, row1, outwidth4);
memcpy_vfpu(out, row1, outwidth4);
}
}
free(memalloc);
@ -221,7 +221,7 @@ static void Image_Resample24 (void *indata, int inwidth, int inheight,
if (yi != oldy) {
inrow = (byte *) indata + inwidth3 * yi;
if (yi == oldy + 1)
memcpy(row1, row2, outwidth3);
memcpy_vfpu(row1, row2, outwidth3);
else
Image_Resample24LerpLine (inrow, row1, inwidth, outwidth);
Image_Resample24LerpLine (inrow + inwidth3, row2, inwidth, outwidth);
@ -259,12 +259,12 @@ static void Image_Resample24 (void *indata, int inwidth, int inheight,
if (yi != oldy) {
inrow = (byte *) indata + inwidth3 * yi;
if (yi == oldy+1)
memcpy(row1, row2, outwidth3);
memcpy_vfpu(row1, row2, outwidth3);
else
Image_Resample24LerpLine (inrow, row1, inwidth, outwidth);
oldy = yi;
}
memcpy(out, row1, outwidth3);
memcpy_vfpu(out, row1, outwidth3);
}
}
free(memalloc);

View file

@ -232,7 +232,7 @@ void DumpChunks(void)
data_p=iff_data;
do
{
memcpy (str, data_p, 4);
memcpy_vfpu(str, data_p, 4);
data_p += 4;
iff_chunk_len = GetLittleLong();
Con_Printf ("0x%x : %s (%d)\n", (int)(data_p - 4), str, iff_chunk_len);

View file

@ -263,7 +263,7 @@ void SV_ConnectClient (int clientnum)
netconnection = client->netconnection;
if (sv.loadgame)
memcpy (spawn_parms, client->spawn_parms, sizeof(spawn_parms));
memcpy_vfpu(spawn_parms, client->spawn_parms, sizeof(spawn_parms));
memset (client, 0, sizeof(*client));
client->netconnection = netconnection;
@ -278,7 +278,7 @@ void SV_ConnectClient (int clientnum)
client->privileged = false;
if (sv.loadgame)
memcpy (client->spawn_parms, spawn_parms, sizeof(spawn_parms));
memcpy_vfpu(client->spawn_parms, spawn_parms, sizeof(spawn_parms));
else
{
// call the progs to get default spawn parms for the new client

View file

@ -1713,7 +1713,7 @@ trace_t SV_Trace_Toss (edict_t *ent, edict_t *ignore)
save_frametime = host_frametime;
host_frametime = 0.05;
memcpy (&tempent, ent, sizeof(edict_t));
memcpy_vfpu(&tempent, ent, sizeof(edict_t));
tent = &tempent;
while (1)

View file

@ -48,6 +48,340 @@ typedef struct
void Cache_FreeLow (int new_low_hunk);
void Cache_FreeHigh (int new_high_hunk);
void* memcpy_vfpu( void* dst, void* src, unsigned int size )
{
u8* src8 = (u8*)src;
u8* dst8 = (u8*)dst;
// < 8 isn't worth trying any optimisations...
if (size<8) goto bytecopy;
// < 64 means we don't gain anything from using vfpu...
if (size<64)
{
// Align dst on 4 bytes or just resume if already done
while (((((u32)dst8) & 0x3)!=0) && size) {
*dst8++ = *src8++;
size--;
}
if (size<4) goto bytecopy;
// We are dst aligned now and >= 4 bytes to copy
u32* src32 = (u32*)src8;
u32* dst32 = (u32*)dst8;
switch(((u32)src8)&0x3)
{
case 0:
while (size&0xC)
{
*dst32++ = *src32++;
size -= 4;
}
if (size==0) return (dst); // fast out
while (size>=16)
{
*dst32++ = *src32++;
*dst32++ = *src32++;
*dst32++ = *src32++;
*dst32++ = *src32++;
size -= 16;
}
if (size==0) return (dst); // fast out
src8 = (u8*)src32;
dst8 = (u8*)dst32;
break;
default:
{
register u32 a, b, c, d;
while (size>=4)
{
a = *src8++;
b = *src8++;
c = *src8++;
d = *src8++;
*dst32++ = (d << 24) | (c << 16) | (b << 8) | a;
size -= 4;
}
if (size==0) return (dst); // fast out
dst8 = (u8*)dst32;
}
break;
}
goto bytecopy;
}
// Align dst on 16 bytes to gain from vfpu aligned stores
while ((((u32)dst8) & 0xF)!=0 && size) {
*dst8++ = *src8++;
size--;
}
// We use uncached dst to use VFPU writeback and free cpu cache for src only
u8* udst8 = (u8*)((u32)dst8 | 0x40000000);
// We need the 64 byte aligned address to make sure the dcache is invalidated correctly
u8* dst64a = ((u32)dst8&~0x3F);
// Invalidate the first line that matches up to the dst start
if (size>=64)
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"cache 0x1B, 0(%0)\n"
"addiu %0, %0, 64\n"
"sync\n"
".set pop\n"
:"+r"(dst64a));
switch(((u32)src8&0xF))
{
// src aligned on 16 bytes too? nice!
case 0:
while (size>=64)
{
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"cache 0x1B, 0(%2)\n" // Dcache writeback invalidate
"lv.q c000, 0(%1)\n"
"lv.q c010, 16(%1)\n"
"lv.q c020, 32(%1)\n"
"lv.q c030, 48(%1)\n"
"sync\n" // Wait for allegrex writeback
"sv.q c000, 0(%0), wb\n"
"sv.q c010, 16(%0), wb\n"
"sv.q c020, 32(%0), wb\n"
"sv.q c030, 48(%0), wb\n"
// Lots of variable updates... but get hidden in sv.q latency anyway
"addiu %3, %3, -64\n"
"addiu %2, %2, 64\n"
"addiu %1, %1, 64\n"
"addiu %0, %0, 64\n"
".set pop\n" // restore assembler option
:"+r"(udst8),"+r"(src8),"+r"(dst64a),"+r"(size)
:
:"memory"
);
}
if (size>16)
{
// Invalidate the last cache line where the max remaining 63 bytes are
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"cache 0x1B, 0(%0)\n"
"sync\n"
".set pop\n" // restore assembler option
::"r"(dst64a));
while (size>=16)
{
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"lv.q c000, 0(%1)\n"
"sv.q c000, 0(%0), wb\n"
// Lots of variable updates... but get hidden in sv.q latency anyway
"addiu %2, %2, -16\n"
"addiu %1, %1, 16\n"
"addiu %0, %0, 16\n"
".set pop\n" // restore assembler option
:"+r"(udst8),"+r"(src8),"+r"(size)
:
:"memory"
);
}
}
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"vflush\n" // Flush VFPU writeback cache
".set pop\n" // restore assembler option
);
dst8 = (u8*)((u32)udst8 & ~0x40000000);
break;
// src is only qword unaligned but word aligned? We can at least use ulv.q
case 4:
case 8:
case 12:
while (size>=64)
{
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"cache 0x1B, 0(%2)\n" // Dcache writeback invalidate
"ulv.q c000, 0(%1)\n"
"ulv.q c010, 16(%1)\n"
"ulv.q c020, 32(%1)\n"
"ulv.q c030, 48(%1)\n"
"sync\n" // Wait for allegrex writeback
"sv.q c000, 0(%0), wb\n"
"sv.q c010, 16(%0), wb\n"
"sv.q c020, 32(%0), wb\n"
"sv.q c030, 48(%0), wb\n"
// Lots of variable updates... but get hidden in sv.q latency anyway
"addiu %3, %3, -64\n"
"addiu %2, %2, 64\n"
"addiu %1, %1, 64\n"
"addiu %0, %0, 64\n"
".set pop\n" // restore assembler option
:"+r"(udst8),"+r"(src8),"+r"(dst64a),"+r"(size)
:
:"memory"
);
}
if (size>16)
// Invalidate the last cache line where the max remaining 63 bytes are
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"cache 0x1B, 0(%0)\n"
"sync\n"
".set pop\n" // restore assembler option
::"r"(dst64a));
while (size>=16)
{
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"ulv.q c000, 0(%1)\n"
"sv.q c000, 0(%0), wb\n"
// Lots of variable updates... but get hidden in sv.q latency anyway
"addiu %2, %2, -16\n"
"addiu %1, %1, 16\n"
"addiu %0, %0, 16\n"
".set pop\n" // restore assembler option
:"+r"(udst8),"+r"(src8),"+r"(size)
:
:"memory"
);
}
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"vflush\n" // Flush VFPU writeback cache
".set pop\n" // restore assembler option
);
dst8 = (u8*)((u32)udst8 & ~0x40000000);
break;
// src not aligned? too bad... have to use unaligned reads
default:
while (size>=64)
{
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"cache 0x1B, 0(%2)\n"
"lwr $8, 0(%1)\n" //
"lwl $8, 3(%1)\n" // $8 = *(s + 0)
"lwr $9, 4(%1)\n" //
"lwl $9, 7(%1)\n" // $9 = *(s + 4)
"lwr $10, 8(%1)\n" //
"lwl $10, 11(%1)\n" // $10 = *(s + 8)
"lwr $11, 12(%1)\n" //
"lwl $11, 15(%1)\n" // $11 = *(s + 12)
"mtv $8, s000\n"
"mtv $9, s001\n"
"mtv $10, s002\n"
"mtv $11, s003\n"
"lwr $8, 16(%1)\n"
"lwl $8, 19(%1)\n"
"lwr $9, 20(%1)\n"
"lwl $9, 23(%1)\n"
"lwr $10, 24(%1)\n"
"lwl $10, 27(%1)\n"
"lwr $11, 28(%1)\n"
"lwl $11, 31(%1)\n"
"mtv $8, s010\n"
"mtv $9, s011\n"
"mtv $10, s012\n"
"mtv $11, s013\n"
"lwr $8, 32(%1)\n"
"lwl $8, 35(%1)\n"
"lwr $9, 36(%1)\n"
"lwl $9, 39(%1)\n"
"lwr $10, 40(%1)\n"
"lwl $10, 43(%1)\n"
"lwr $11, 44(%1)\n"
"lwl $11, 47(%1)\n"
"mtv $8, s020\n"
"mtv $9, s021\n"
"mtv $10, s022\n"
"mtv $11, s023\n"
"lwr $8, 48(%1)\n"
"lwl $8, 51(%1)\n"
"lwr $9, 52(%1)\n"
"lwl $9, 55(%1)\n"
"lwr $10, 56(%1)\n"
"lwl $10, 59(%1)\n"
"lwr $11, 60(%1)\n"
"lwl $11, 63(%1)\n"
"mtv $8, s030\n"
"mtv $9, s031\n"
"mtv $10, s032\n"
"mtv $11, s033\n"
"sync\n"
"sv.q c000, 0(%0), wb\n"
"sv.q c010, 16(%0), wb\n"
"sv.q c020, 32(%0), wb\n"
"sv.q c030, 48(%0), wb\n"
// Lots of variable updates... but get hidden in sv.q latency anyway
"addiu %3, %3, -64\n"
"addiu %2, %2, 64\n"
"addiu %1, %1, 64\n"
"addiu %0, %0, 64\n"
".set pop\n" // restore assembler option
:"+r"(udst8),"+r"(src8),"+r"(dst64a),"+r"(size)
:
:"$8","$9","$10","$11","memory"
);
}
if (size>16)
// Invalidate the last cache line where the max remaining 63 bytes are
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"cache 0x1B, 0(%0)\n"
"sync\n"
".set pop\n" // restore assembler option
::"r"(dst64a));
while (size>=16)
{
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"lwr $8, 0(%1)\n" //
"lwl $8, 3(%1)\n" // $8 = *(s + 0)
"lwr $9, 4(%1)\n" //
"lwl $9, 7(%1)\n" // $9 = *(s + 4)
"lwr $10, 8(%1)\n" //
"lwl $10, 11(%1)\n" // $10 = *(s + 8)
"lwr $11, 12(%1)\n" //
"lwl $11, 15(%1)\n" // $11 = *(s + 12)
"mtv $8, s000\n"
"mtv $9, s001\n"
"mtv $10, s002\n"
"mtv $11, s003\n"
"sv.q c000, 0(%0), wb\n"
// Lots of variable updates... but get hidden in sv.q latency anyway
"addiu %2, %2, -16\n"
"addiu %1, %1, 16\n"
"addiu %0, %0, 16\n"
".set pop\n" // restore assembler option
:"+r"(udst8),"+r"(src8),"+r"(size)
:
:"$8","$9","$10","$11","memory"
);
}
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"vflush\n" // Flush VFPU writeback cache
".set pop\n" // restore assembler option
);
dst8 = (u8*)((u32)udst8 & ~0x40000000);
break;
}
bytecopy:
// Copy the remains byte per byte...
while (size--)
{
*dst8++ = *src8++;
}
return (dst);
}
/*
==============================================================================
@ -374,7 +708,7 @@ void Hunk_Print (qboolean all)
//
// print the single block
//
memcpy (name, h->name, 8);
memcpy_vfpu(name, h->name, 8);
if (all)
Con_Printf ("%8p :%8i %8s\n",h, h->size, name);

View file

@ -127,5 +127,5 @@ void *Cache_Alloc (cache_user_t *c, int size, char *name);
void Cache_Report (void);
// memcpy-style copy of size bytes from src to dst; returns dst.
// Small copies are done with plain byte/word loops, copies of 64 bytes or
// more go through the PSP VFPU.
void* memcpy_vfpu(void* dst, void* src, unsigned int size);