diff --git a/source/cl_main.c b/source/cl_main.c index c2226c6..5a9a025 100644 --- a/source/cl_main.c +++ b/source/cl_main.c @@ -337,12 +337,12 @@ void SetPal (int i) void CL_CopyPlayerInfo (entity_t *ent, entity_t *player) { - memcpy (&ent->baseline, &player->baseline, sizeof(entity_state_t)); + memcpy_vfpu(&ent->baseline, &player->baseline, sizeof(entity_state_t)); ent->msgtime = player->msgtime; - memcpy (ent->msg_origins, player->msg_origins, sizeof(ent->msg_origins)); + memcpy_vfpu(ent->msg_origins, player->msg_origins, sizeof(ent->msg_origins)); VectorCopy (player->origin, ent->origin); - memcpy (ent->msg_angles, player->msg_angles, sizeof(ent->msg_angles)); + memcpy_vfpu(ent->msg_angles, player->msg_angles, sizeof(ent->msg_angles)); VectorCopy (player->angles, ent->angles); ent->model = (ent == &q3player_body.ent) ? cl.model_precache[cl_modelindex[mi_q3torso]] : cl.model_precache[cl_modelindex[mi_q3head]]; diff --git a/source/cl_parse.c b/source/cl_parse.c index 0ea02cd..7811f91 100644 --- a/source/cl_parse.c +++ b/source/cl_parse.c @@ -205,7 +205,7 @@ void CL_KeepaliveMessage (void) // read messages from server, should just be nops old = net_message; - memcpy (olddata, net_message.data, net_message.cursize); + memcpy_vfpu(olddata, net_message.data, net_message.cursize); do { @@ -227,7 +227,7 @@ void CL_KeepaliveMessage (void) } while (ret); net_message = old; - memcpy (net_message.data, olddata, net_message.cursize); + memcpy_vfpu(net_message.data, olddata, net_message.cursize); // check time time = Sys_FloatTime (); diff --git a/source/cl_slist.c b/source/cl_slist.c index bc5dc97..ccbfa3b 100644 --- a/source/cl_slist.c +++ b/source/cl_slist.c @@ -124,9 +124,9 @@ void SList_Switch (int a, int b) if (b >= MAX_SERVER_LIST || b < 0) Sys_Error ("SList_Switch: Bad index %d", b); - memcpy (&temp, &slist[a], sizeof(temp)); - memcpy (&slist[a], &slist[b], sizeof(temp)); - memcpy (&slist[b], &temp, sizeof(temp)); + memcpy_vfpu(&temp, &slist[a], sizeof(temp)); + memcpy_vfpu(&slist[a], &slist[b], sizeof(temp)); + memcpy_vfpu(&slist[b], &temp, sizeof(temp)); } int SList_Length (void) diff --git a/source/cmd.c b/source/cmd.c index 7f6d0b6..e1355ab 100644 --- a/source/cmd.c +++ b/source/cmd.c @@ -164,7 +164,7 @@ void Cbuf_Execute (void) } - memcpy (line, text, i); + memcpy_vfpu(line, text, i); line[i] = 0; // delete the text from the command buffer and move remaining commands down diff --git a/source/host_cmd.c b/source/host_cmd.c index 8390f9d..93d905c 100644 --- a/source/host_cmd.c +++ b/source/host_cmd.c @@ -406,9 +406,9 @@ void Host_SavegameComment (char *text) for (i=0 ; iv.origin, ent->v.view_ofs, org); leaf = Mod_PointInLeaf (org, sv.worldmodel); pvs = Mod_LeafPVS (leaf, sv.worldmodel); - memcpy (checkpvs, pvs, (sv.worldmodel->numleafs+7)>>3 ); + memcpy_vfpu(checkpvs, pvs, (sv.worldmodel->numleafs+7)>>3 ); return i; } diff --git a/source/psp/input.cpp b/source/psp/input.cpp index 58a4446..780cfa0 100644 --- a/source/psp/input.cpp +++ b/source/psp/input.cpp @@ -94,9 +94,9 @@ void IN_Init (void) buttonToGameKeyMap[buttonMaskToShift(PSP_CTRL_RIGHT)] = K_RIGHTARROW; buttonToGameKeyMap[buttonMaskToShift(PSP_CTRL_DOWN)] = K_DOWNARROW; buttonToGameKeyMap[buttonMaskToShift(PSP_CTRL_LEFT)] = K_LEFTARROW; - memcpy(buttonToConsoleKeyMap, buttonToGameKeyMap, sizeof(ButtonToKeyMap)); - memcpy(buttonToMessageKeyMap, buttonToGameKeyMap, sizeof(ButtonToKeyMap)); - memcpy(buttonToMenuKeyMap, buttonToGameKeyMap, sizeof(ButtonToKeyMap)); + memcpy_vfpu(buttonToConsoleKeyMap, buttonToGameKeyMap, sizeof(ButtonToKeyMap)); + memcpy_vfpu(buttonToMessageKeyMap, buttonToGameKeyMap, sizeof(ButtonToKeyMap)); + memcpy_vfpu(buttonToMenuKeyMap, buttonToGameKeyMap, sizeof(ButtonToKeyMap)); // Game keys: buttonToGameKeyMap[buttonMaskToShift(PSP_CTRL_LTRIGGER)] = K_AUX1; @@ -115,7 +115,7 @@ void IN_Init (void) buttonToConsoleKeyMap[buttonMaskToShift(PSP_CTRL_SQUARE)] = K_INS; // Message keys: - memcpy(buttonToMessageKeyMap, buttonToConsoleKeyMap, sizeof(ButtonToKeyMap)); + memcpy_vfpu(buttonToMessageKeyMap, buttonToConsoleKeyMap, sizeof(ButtonToKeyMap)); // Menu keys: buttonToMenuKeyMap[buttonMaskToShift(PSP_CTRL_SQUARE)] = K_INS; diff --git a/source/psp/network_psp.cpp b/source/psp/network_psp.cpp index 3f27025..e528080 100644 --- a/source/psp/network_psp.cpp +++ b/source/psp/network_psp.cpp @@ -77,7 +77,7 @@ static pdpStatStruct gPdpStat; pdpStatStruct *findPdpStat(int socket, pdpStatStruct *pdpStat) { if(socket == pdpStat->pdpId) { - memcpy(&gPdpStat, pdpStat, sizeof(pdpStatStruct)); + memcpy_vfpu(&gPdpStat, pdpStat, sizeof(pdpStatStruct)); return &gPdpStat; } if(pdpStat->next) return findPdpStat(socket, pdpStat->next); @@ -863,7 +863,7 @@ namespace quake pdpStatStruct *tempPdp = findPdpStat(socket, pdpStat); if(tempPdp < 0) return -1; - memcpy(((struct sockaddr_adhoc *)addr)->mac, tempPdp->mac, 6); + memcpy_vfpu(((struct sockaddr_adhoc *)addr)->mac, tempPdp->mac, 6); ((struct sockaddr_adhoc *)addr)->port = tempPdp->port; addr->sa_family = ADHOC_NET; return 0; diff --git a/source/psp/video_hardware_draw.cpp b/source/psp/video_hardware_draw.cpp index 5725b1a..5cc4545 100644 --- a/source/psp/video_hardware_draw.cpp +++ b/source/psp/video_hardware_draw.cpp @@ -488,7 +488,7 @@ qpic_t *Draw_CacheImg (char *path) // the translatable player picture just for the menu // configuration dialog if (!strcmp (path, "gfx/menuplyr.lmp")) - memcpy (menuplyr_pixels, dat->data, dat->width*dat->height); + memcpy(menuplyr_pixels, dat->data, dat->width*dat->height); pic->pic.width = dat->width; pic->pic.height = dat->height; @@ -3398,16 +3398,6 @@ void GL_Upload4(int texture_index, const byte *data, int width, int height) memcpy(texture.palette, data + buffer_size, 16 * 4); int i; - // Copy to VRAM? - /*if (texture.vram) - { - // Copy. - memcpy(texture.vram, texture.ram, buffer_size); - - // Flush the data cache. - sceKernelDcacheWritebackRange(texture.vram, buffer_size); - }*/ - // Flush the data cache. sceKernelDcacheWritebackRange(texture.ram, buffer_size); } diff --git a/source/psp/video_hardware_hlmdl.cpp b/source/psp/video_hardware_hlmdl.cpp index e321c73..ee9dabf 100644 --- a/source/psp/video_hardware_hlmdl.cpp +++ b/source/psp/video_hardware_hlmdl.cpp @@ -130,7 +130,7 @@ qboolean Mod_LoadHLModel (model_t *mod, void *buffer) model = static_cast(Hunk_Alloc(sizeof(hlmodelcache_t))); header = static_cast(Hunk_Alloc(com_filesize)); - memcpy(header, buffer, com_filesize); + memcpy_vfpu(header, buffer, com_filesize); if (header->version != 10) { @@ -175,7 +175,7 @@ qboolean Mod_LoadHLModel (model_t *mod, void *buffer) if (!mod->cache.data) return qfalse; - memcpy (mod->cache.data, model, total); + memcpy_vfpu(mod->cache.data, model, total); Hunk_FreeToLowMark (start); return qtrue; @@ -482,7 +482,7 @@ void HL_SetupBones(hlmodel_t *model) } else { - memcpy(transform_matrix[i], matrix, 12 * sizeof(float)); + memcpy_vfpu(transform_matrix[i], matrix, 12 * sizeof(float)); } } } diff --git a/source/psp/video_hardware_images.cpp b/source/psp/video_hardware_images.cpp index 74fb8e3..6e51743 100644 --- a/source/psp/video_hardware_images.cpp +++ b/source/psp/video_hardware_images.cpp @@ -260,7 +260,7 @@ byte* LoadPCX (FILE *f, int matchwidth, int matchheight) image_width = pcx->xmax+1; image_height = pcx->ymax+1; - memcpy(image_palette, palette, sizeof(palette)); + memcpy_vfpu(image_palette, palette, sizeof(palette)); image_palette_type = PAL_RGB; fclose (f); @@ -304,7 +304,7 @@ byte *LoadWAL (char *name) size = width * height; data = static_cast(malloc(size)); - memcpy(data, (byte *)mt + ofs, size); + memcpy_vfpu(data, (byte *)mt + ofs, size); image_palette_type = PAL_Q2; diff --git a/source/psp/video_hardware_mhex2.cpp b/source/psp/video_hardware_mhex2.cpp index a3b3b14..343a4d7 100644 --- a/source/psp/video_hardware_mhex2.cpp +++ b/source/psp/video_hardware_mhex2.cpp @@ -343,7 +343,7 @@ void GL_MakeAliasModelDisplayListsH2 (model_t *m, aliashdr_t *hdr) int* cmds = static_cast(Hunk_Alloc (numcommands * 4)); paliashdr->commands = (byte *)cmds - (byte *)paliashdr; - memcpy (cmds, commands, numcommands * 4); + memcpy_vfpu(cmds, commands, numcommands * 4); trivertx_t* verts = static_cast(Hunk_Alloc (paliashdr->numposes * paliashdr->poseverts * sizeof(trivertx_t) )); @@ -420,7 +420,7 @@ void GL_MakeAliasModelDisplayLists (model_t *m, aliashdr_t *hdr) cmds = static_cast(Hunk_Alloc (numcommands * 4)); paliashdr->commands = (byte *)cmds - (byte *)paliashdr; - memcpy (cmds, commands, numcommands * 4); + memcpy_vfpu(cmds, commands, numcommands * 4); verts = static_cast(Hunk_Alloc (paliashdr->numposes * paliashdr->poseverts * sizeof(trivertx_t) )); diff --git a/source/psp/video_hardware_model.cpp b/source/psp/video_hardware_model.cpp index e56760b..842a8ad 100644 --- a/source/psp/video_hardware_model.cpp +++ b/source/psp/video_hardware_model.cpp @@ -537,13 +537,13 @@ void Mod_LoadTextures (lump_t *l) loadmodel->textures[i] = tx; - memcpy (tx->name, mt->name, sizeof(tx->name)); + memcpy_vfpu(tx->name, mt->name, sizeof(tx->name)); tx->width = mt->width; tx->height = mt->height; for (j=0 ; joffsets[j] = mt->offsets[j] + sizeof(texture_t) - sizeof(miptex_t); // the pixels immediately follow the structures - memcpy ( tx_pixels, mt+1, pixels); + memcpy_vfpu( tx_pixels, mt+1, pixels); int level = 0; if (r_mipmaps.value > 0) @@ -776,7 +776,7 @@ void Mod_LoadLighting (lump_t *l) return; } loadmodel->lightdata = static_cast(Hunk_AllocName ( l->filelen, loadname)); - memcpy (loadmodel->lightdata, mod_base + l->fileofs, l->filelen); + memcpy_vfpu(loadmodel->lightdata, mod_base + l->fileofs, l->filelen); return; } @@ -816,7 +816,7 @@ void Mod_LoadLighting (lump_t *l) loadmodel->lightdata = static_cast(Hunk_AllocName ( l->filelen*3, litfilename)); in = loadmodel->lightdata + l->filelen*2; // place the file at the end, so it will not be overwritten until the very last write out = loadmodel->lightdata; - memcpy (in, mod_base + l->fileofs, l->filelen); + memcpy_vfpu(in, mod_base + l->fileofs, l->filelen); for (i = 0;i < l->filelen;i++) { d = *in++; @@ -853,7 +853,7 @@ void Mod_HL_LoadLighting (lump_t *l) } loadmodel->lightdata = static_cast(Hunk_AllocName ( l->filelen, loadname)); - memcpy (loadmodel->lightdata, mod_base + l->fileofs, l->filelen); + memcpy_vfpu(loadmodel->lightdata, mod_base + l->fileofs, l->filelen); } /* @@ -871,7 +871,7 @@ void Mod_LoadVisibility (lump_t *l) } loadmodel->visdata = static_cast(Hunk_AllocName ( l->filelen, loadname)); - memcpy (loadmodel->visdata, mod_base + l->fileofs, l->filelen); + memcpy_vfpu(loadmodel->visdata, mod_base + l->fileofs, l->filelen); } /* @@ -980,7 +980,7 @@ void Mod_LoadEntities (lump_t *l) } loadmodel->entities = static_cast(Hunk_AllocName ( l->filelen, entfilename)); - memcpy (loadmodel->entities, mod_base + l->fileofs, l->filelen); + memcpy_vfpu(loadmodel->entities, mod_base + l->fileofs, l->filelen); if (loadmodel->bspversion == HL_BSPVERSION || loadmodel->bspversion == NZP_BSPVERSION) Mod_ParseWadsFromEntityLump(loadmodel->entities); @@ -2324,7 +2324,7 @@ void Mod_LoadAliasModel (model_t *mod, void *buffer) if (!mod->cache.data) return; - memcpy (mod->cache.data, pheader, total); + memcpy_vfpu(mod->cache.data, pheader, total); Hunk_FreeToLowMark (start); } @@ -2496,7 +2496,7 @@ void Mod_LoadH2AliasModel (model_t *mod, void *buffer) Cache_Alloc (&mod->cache, total, loadname); if (!mod->cache.data) return; - memcpy (mod->cache.data, pheader, total); + memcpy_vfpu(mod->cache.data, pheader, total); Hunk_FreeToLowMark (start); } @@ -2650,7 +2650,7 @@ void Mod_LoadQ2AliasModel (model_t *mod, void *buffer) Cache_Alloc (&mod->cache, total, loadname); if (!mod->cache.data) return; - memcpy (mod->cache.data, pheader, total); + memcpy_vfpu(mod->cache.data, pheader, total); Hunk_FreeToLowMark (start); } @@ -3247,7 +3247,7 @@ void Mod_LoadQ3AliasModel (model_t *mod, void *buffer) if (!mod->cache.data) return; - memcpy (header, buffer, com_filesize); + memcpy_vfpu(header, buffer, com_filesize); base = com_filesize; mod->type = mod_md3; diff --git a/source/psp/video_hardware_resample.cpp b/source/psp/video_hardware_resample.cpp index bfe869b..4514880 100644 --- a/source/psp/video_hardware_resample.cpp +++ b/source/psp/video_hardware_resample.cpp @@ -74,7 +74,7 @@ static void Image_Resample32 (void *indata, int inwidth, int inheight,void *outd { inrow = (byte *) indata + inwidth4 * yi; if (yi == oldy + 1) - memcpy(row1, row2, outwidth4); + memcpy_vfpu(row1, row2, outwidth4); else Image_Resample32LerpLine (inrow, row1, inwidth, outwidth); Image_Resample32LerpLine (inrow + inwidth4, row2, inwidth, outwidth); @@ -116,12 +116,12 @@ static void Image_Resample32 (void *indata, int inwidth, int inheight,void *outd { inrow = (byte *) indata + inwidth4 * yi; if (yi == oldy+1) - memcpy(row1, row2, outwidth4); + memcpy_vfpu(row1, row2, outwidth4); else Image_Resample32LerpLine (inrow, row1, inwidth, outwidth); oldy = yi; } - memcpy(out, row1, outwidth4); + memcpy_vfpu(out, row1, outwidth4); } } free(memalloc); @@ -221,7 +221,7 @@ static void Image_Resample24 (void *indata, int inwidth, int inheight, if (yi != oldy) { inrow = (byte *) indata + inwidth3 * yi; if (yi == oldy + 1) - memcpy(row1, row2, outwidth3); + memcpy_vfpu(row1, row2, outwidth3); else Image_Resample24LerpLine (inrow, row1, inwidth, outwidth); Image_Resample24LerpLine (inrow + inwidth3, row2, inwidth, outwidth); @@ -259,12 +259,12 @@ static void Image_Resample24 (void *indata, int inwidth, int inheight, if (yi != oldy) { inrow = (byte *) indata + inwidth3 * yi; if (yi == oldy+1) - memcpy(row1, row2, outwidth3); + memcpy_vfpu(row1, row2, outwidth3); else Image_Resample24LerpLine (inrow, row1, inwidth, outwidth); oldy = yi; } - memcpy(out, row1, outwidth3); + memcpy_vfpu(out, row1, outwidth3); } } free(memalloc); diff --git a/source/snd_mem.c b/source/snd_mem.c index 45b5a95..8af75ea 100644 --- a/source/snd_mem.c +++ b/source/snd_mem.c @@ -232,7 +232,7 @@ void DumpChunks(void) data_p=iff_data; do { - memcpy (str, data_p, 4); + memcpy_vfpu(str, data_p, 4); data_p += 4; iff_chunk_len = GetLittleLong(); Con_Printf ("0x%x : %s (%d)\n", (int)(data_p - 4), str, iff_chunk_len); diff --git a/source/sv_main.c b/source/sv_main.c index c099b3c..2c4918c 100644 --- a/source/sv_main.c +++ b/source/sv_main.c @@ -263,7 +263,7 @@ void SV_ConnectClient (int clientnum) netconnection = client->netconnection; if (sv.loadgame) - memcpy (spawn_parms, client->spawn_parms, sizeof(spawn_parms)); + memcpy_vfpu(spawn_parms, client->spawn_parms, sizeof(spawn_parms)); memset (client, 0, sizeof(*client)); client->netconnection = netconnection; @@ -278,7 +278,7 @@ void SV_ConnectClient (int clientnum) client->privileged = false; if (sv.loadgame) - memcpy (client->spawn_parms, spawn_parms, sizeof(spawn_parms)); + memcpy_vfpu(client->spawn_parms, spawn_parms, sizeof(spawn_parms)); else { // call the progs to get default spawn parms for the new client diff --git a/source/sv_phys.c b/source/sv_phys.c index 2dc009e..39c4aef 100644 --- a/source/sv_phys.c +++ b/source/sv_phys.c @@ -1713,7 +1713,7 @@ trace_t SV_Trace_Toss (edict_t *ent, edict_t *ignore) save_frametime = host_frametime; host_frametime = 0.05; - memcpy (&tempent, ent, sizeof(edict_t)); + memcpy_vfpu(&tempent, ent, sizeof(edict_t)); tent = &tempent; while (1) diff --git a/source/zone.c b/source/zone.c index b27eecb..c64e1ca 100644 --- a/source/zone.c +++ b/source/zone.c @@ -48,6 +48,340 @@ typedef struct void Cache_FreeLow (int new_low_hunk); void Cache_FreeHigh (int new_high_hunk); +void* memcpy_vfpu( void* dst, void* src, unsigned int size ) +{ + u8* src8 = (u8*)src; + u8* dst8 = (u8*)dst; + + // < 8 isn't worth trying any optimisations... + if (size<8) goto bytecopy; + + // < 64 means we don't gain anything from using vfpu... + if (size<64) + { + // Align dst on 4 bytes or just resume if already done + while (((((u32)dst8) & 0x3)!=0) && size) { + *dst8++ = *src8++; + size--; + } + if (size<4) goto bytecopy; + + // We are dst aligned now and >= 4 bytes to copy + u32* src32 = (u32*)src8; + u32* dst32 = (u32*)dst8; + switch(((u32)src8)&0x3) + { + case 0: + while (size&0xC) + { + *dst32++ = *src32++; + size -= 4; + } + if (size==0) return (dst); // fast out + while (size>=16) + { + *dst32++ = *src32++; + *dst32++ = *src32++; + *dst32++ = *src32++; + *dst32++ = *src32++; + size -= 16; + } + if (size==0) return (dst); // fast out + src8 = (u8*)src32; + dst8 = (u8*)dst32; + break; + default: + { + register u32 a, b, c, d; + while (size>=4) + { + a = *src8++; + b = *src8++; + c = *src8++; + d = *src8++; + *dst32++ = (d << 24) | (c << 16) | (b << 8) | a; + size -= 4; + } + if (size==0) return (dst); // fast out + dst8 = (u8*)dst32; + } + break; + } + goto bytecopy; + } + + // Align dst on 16 bytes to gain from vfpu aligned stores + while ((((u32)dst8) & 0xF)!=0 && size) { + *dst8++ = *src8++; + size--; + } + + // We use uncached dst to use VFPU writeback and free cpu cache for src only + u8* udst8 = (u8*)((u32)dst8 | 0x40000000); + // We need the 64 byte aligned address to make sure the dcache is invalidated correctly + u8* dst64a = ((u32)dst8&~0x3F); + // Invalidate the first line that matches up to the dst start + if (size>=64) + asm(".set push\n" // save assembler option + ".set noreorder\n" // suppress reordering + "cache 0x1B, 0(%0)\n" + "addiu %0, %0, 64\n" + "sync\n" + ".set pop\n" + :"+r"(dst64a)); + switch(((u32)src8&0xF)) + { + // src aligned on 16 bytes too? nice! + case 0: + while (size>=64) + { + asm(".set push\n" // save assembler option + ".set noreorder\n" // suppress reordering + "cache 0x1B, 0(%2)\n" // Dcache writeback invalidate + "lv.q c000, 0(%1)\n" + "lv.q c010, 16(%1)\n" + "lv.q c020, 32(%1)\n" + "lv.q c030, 48(%1)\n" + "sync\n" // Wait for allegrex writeback + "sv.q c000, 0(%0), wb\n" + "sv.q c010, 16(%0), wb\n" + "sv.q c020, 32(%0), wb\n" + "sv.q c030, 48(%0), wb\n" + // Lots of variable updates... but get hidden in sv.q latency anyway + "addiu %3, %3, -64\n" + "addiu %2, %2, 64\n" + "addiu %1, %1, 64\n" + "addiu %0, %0, 64\n" + ".set pop\n" // restore assembler option + :"+r"(udst8),"+r"(src8),"+r"(dst64a),"+r"(size) + : + :"memory" + ); + } + if (size>16) + { + // Invalidate the last cache line where the max remaining 63 bytes are + asm(".set push\n" // save assembler option + ".set noreorder\n" // suppress reordering + "cache 0x1B, 0(%0)\n" + "sync\n" + ".set pop\n" // restore assembler option + ::"r"(dst64a)); + while (size>=16) + { + asm(".set push\n" // save assembler option + ".set noreorder\n" // suppress reordering + "lv.q c000, 0(%1)\n" + "sv.q c000, 0(%0), wb\n" + // Lots of variable updates... but get hidden in sv.q latency anyway + "addiu %2, %2, -16\n" + "addiu %1, %1, 16\n" + "addiu %0, %0, 16\n" + ".set pop\n" // restore assembler option + :"+r"(udst8),"+r"(src8),"+r"(size) + : + :"memory" + ); + } + } + asm(".set push\n" // save assembler option + ".set noreorder\n" // suppress reordering + "vflush\n" // Flush VFPU writeback cache + ".set pop\n" // restore assembler option + ); + dst8 = (u8*)((u32)udst8 & ~0x40000000); + break; + // src is only qword unaligned but word aligned? We can at least use ulv.q + case 4: + case 8: + case 12: + while (size>=64) + { + asm(".set push\n" // save assembler option + ".set noreorder\n" // suppress reordering + "cache 0x1B, 0(%2)\n" // Dcache writeback invalidate + "ulv.q c000, 0(%1)\n" + "ulv.q c010, 16(%1)\n" + "ulv.q c020, 32(%1)\n" + "ulv.q c030, 48(%1)\n" + "sync\n" // Wait for allegrex writeback + "sv.q c000, 0(%0), wb\n" + "sv.q c010, 16(%0), wb\n" + "sv.q c020, 32(%0), wb\n" + "sv.q c030, 48(%0), wb\n" + // Lots of variable updates... but get hidden in sv.q latency anyway + "addiu %3, %3, -64\n" + "addiu %2, %2, 64\n" + "addiu %1, %1, 64\n" + "addiu %0, %0, 64\n" + ".set pop\n" // restore assembler option + :"+r"(udst8),"+r"(src8),"+r"(dst64a),"+r"(size) + : + :"memory" + ); + } + if (size>16) + // Invalidate the last cache line where the max remaining 63 bytes are + asm(".set push\n" // save assembler option + ".set noreorder\n" // suppress reordering + "cache 0x1B, 0(%0)\n" + "sync\n" + ".set pop\n" // restore assembler option + ::"r"(dst64a)); + while (size>=16) + { + asm(".set push\n" // save assembler option + ".set noreorder\n" // suppress reordering + "ulv.q c000, 0(%1)\n" + "sv.q c000, 0(%0), wb\n" + // Lots of variable updates... but get hidden in sv.q latency anyway + "addiu %2, %2, -16\n" + "addiu %1, %1, 16\n" + "addiu %0, %0, 16\n" + ".set pop\n" // restore assembler option + :"+r"(udst8),"+r"(src8),"+r"(size) + : + :"memory" + ); + } + asm(".set push\n" // save assembler option + ".set noreorder\n" // suppress reordering + "vflush\n" // Flush VFPU writeback cache + ".set pop\n" // restore assembler option + ); + dst8 = (u8*)((u32)udst8 & ~0x40000000); + break; + // src not aligned? too bad... have to use unaligned reads + default: + while (size>=64) + { + asm(".set push\n" // save assembler option + ".set noreorder\n" // suppress reordering + "cache 0x1B, 0(%2)\n" + + "lwr $8, 0(%1)\n" // + "lwl $8, 3(%1)\n" // $8 = *(s + 0) + "lwr $9, 4(%1)\n" // + "lwl $9, 7(%1)\n" // $9 = *(s + 4) + "lwr $10, 8(%1)\n" // + "lwl $10, 11(%1)\n" // $10 = *(s + 8) + "lwr $11, 12(%1)\n" // + "lwl $11, 15(%1)\n" // $11 = *(s + 12) + "mtv $8, s000\n" + "mtv $9, s001\n" + "mtv $10, s002\n" + "mtv $11, s003\n" + + "lwr $8, 16(%1)\n" + "lwl $8, 19(%1)\n" + "lwr $9, 20(%1)\n" + "lwl $9, 23(%1)\n" + "lwr $10, 24(%1)\n" + "lwl $10, 27(%1)\n" + "lwr $11, 28(%1)\n" + "lwl $11, 31(%1)\n" + "mtv $8, s010\n" + "mtv $9, s011\n" + "mtv $10, s012\n" + "mtv $11, s013\n" + + "lwr $8, 32(%1)\n" + "lwl $8, 35(%1)\n" + "lwr $9, 36(%1)\n" + "lwl $9, 39(%1)\n" + "lwr $10, 40(%1)\n" + "lwl $10, 43(%1)\n" + "lwr $11, 44(%1)\n" + "lwl $11, 47(%1)\n" + "mtv $8, s020\n" + "mtv $9, s021\n" + "mtv $10, s022\n" + "mtv $11, s023\n" + + "lwr $8, 48(%1)\n" + "lwl $8, 51(%1)\n" + "lwr $9, 52(%1)\n" + "lwl $9, 55(%1)\n" + "lwr $10, 56(%1)\n" + "lwl $10, 59(%1)\n" + "lwr $11, 60(%1)\n" + "lwl $11, 63(%1)\n" + "mtv $8, s030\n" + "mtv $9, s031\n" + "mtv $10, s032\n" + "mtv $11, s033\n" + + "sync\n" + "sv.q c000, 0(%0), wb\n" + "sv.q c010, 16(%0), wb\n" + "sv.q c020, 32(%0), wb\n" + "sv.q c030, 48(%0), wb\n" + // Lots of variable updates... but get hidden in sv.q latency anyway + "addiu %3, %3, -64\n" + "addiu %2, %2, 64\n" + "addiu %1, %1, 64\n" + "addiu %0, %0, 64\n" + ".set pop\n" // restore assembler option + :"+r"(udst8),"+r"(src8),"+r"(dst64a),"+r"(size) + : + :"$8","$9","$10","$11","memory" + ); + } + if (size>16) + // Invalidate the last cache line where the max remaining 63 bytes are + asm(".set push\n" // save assembler option + ".set noreorder\n" // suppress reordering + "cache 0x1B, 0(%0)\n" + "sync\n" + ".set pop\n" // restore assembler option + ::"r"(dst64a)); + while (size>=16) + { + asm(".set push\n" // save assembler option + ".set noreorder\n" // suppress reordering + "lwr $8, 0(%1)\n" // + "lwl $8, 3(%1)\n" // $8 = *(s + 0) + "lwr $9, 4(%1)\n" // + "lwl $9, 7(%1)\n" // $9 = *(s + 4) + "lwr $10, 8(%1)\n" // + "lwl $10, 11(%1)\n" // $10 = *(s + 8) + "lwr $11, 12(%1)\n" // + "lwl $11, 15(%1)\n" // $11 = *(s + 12) + "mtv $8, s000\n" + "mtv $9, s001\n" + "mtv $10, s002\n" + "mtv $11, s003\n" + + "sv.q c000, 0(%0), wb\n" + // Lots of variable updates... but get hidden in sv.q latency anyway + "addiu %2, %2, -16\n" + "addiu %1, %1, 16\n" + "addiu %0, %0, 16\n" + ".set pop\n" // restore assembler option + :"+r"(udst8),"+r"(src8),"+r"(size) + : + :"$8","$9","$10","$11","memory" + ); + } + asm(".set push\n" // save assembler option + ".set noreorder\n" // suppress reordering + "vflush\n" // Flush VFPU writeback cache + ".set pop\n" // restore assembler option + ); + dst8 = (u8*)((u32)udst8 & ~0x40000000); + break; + } + +bytecopy: + // Copy the remains byte per byte... + while (size--) + { + *dst8++ = *src8++; + } + + return (dst); +} + /* ============================================================================== @@ -374,7 +708,7 @@ void Hunk_Print (qboolean all) // // print the single block // - memcpy (name, h->name, 8); + memcpy_vfpu(name, h->name, 8); if (all) Con_Printf ("%8p :%8i %8s\n",h, h->size, name); diff --git a/source/zone.h b/source/zone.h index 7655ad9..6445428 100644 --- a/source/zone.h +++ b/source/zone.h @@ -127,5 +127,5 @@ void *Cache_Alloc (cache_user_t *c, int size, char *name); void Cache_Report (void); - +void* memcpy_vfpu(void* dst, void* src, unsigned int size);