From 4cf0d76e8cd759bcca5adb27a1b8007ea186937f Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Tue, 1 Nov 2016 13:33:18 +0100 Subject: [PATCH] - fixed: RapidJSON in ASCII mode cannot handle extended 8 bit character sets and will produce broken data if the input contains some. This means we need to perform the conversion to UTF-8 on ZDoom's side and run RapidJSON in UTF-8 mode. Daedalus triggers this with a 0x85 character which in Windows CP 1252 is the ellipsis (...) The converter will assume ISO-8859-1, though, but cannot do anything with these characters because they map to the font being used here. --- src/p_acs.cpp | 4 ++ src/serializer.cpp | 173 ++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 160 insertions(+), 17 deletions(-) diff --git a/src/p_acs.cpp b/src/p_acs.cpp index f36aaaf69..d82e6590c 100644 --- a/src/p_acs.cpp +++ b/src/p_acs.cpp @@ -791,6 +791,10 @@ void ACSStringPool::WriteStrings(FSerializer &file, const char *key) const { if (file.BeginObject(nullptr)) { + if (i == 430) + { + int a = 0; + } file("index", i) ("string", entry->Str) ("lockcount", entry->LockCount) diff --git a/src/serializer.cpp b/src/serializer.cpp index 49226ce80..dec81e86f 100644 --- a/src/serializer.cpp +++ b/src/serializer.cpp @@ -67,6 +67,143 @@ char nulspace[1024 * 1024 * 4]; bool save_full = false; // for testing. Should be removed afterward. +int utf8_encode(int32_t codepoint, char *buffer, int *size) +{ + if (codepoint < 0) + return -1; + else if (codepoint < 0x80) + { + buffer[0] = (char)codepoint; + *size = 1; + } + else if (codepoint < 0x800) + { + buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6); + buffer[1] = 0x80 + ((codepoint & 0x03F)); + *size = 2; + } + else if (codepoint < 0x10000) + { + buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12); + buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6); + buffer[2] = 0x80 + ((codepoint & 0x003F)); + *size = 3; + } + else if (codepoint <= 0x10FFFF) + { + buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18); + buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12); + buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6); + buffer[3] = 0x80 + ((codepoint & 0x00003F)); + *size = 4; + } + else + return -1; + + return 0; +} + +int utf8_decode(const char *src, int *size) +{ + int c = src[0] & 255; + int r; + + *size = 1; + if ((c & 0x80) == 0) + { + return c; + } + + int c1 = src[1] & 255; + + if ((c & 0xE0) == 0xC0) + { + r = ((c & 0x1F) << 6) | c1; + if (r >= 128) + { + *size = 2; + return r; + } + return -1; + } + + int c2 = src[2] & 255; + + if ((c & 0xF0) == 0xE0) + { + r = ((c & 0x0F) << 12) | (c1 << 6) | c2; + if (r >= 2048 && (r < 55296 || r > 57343)) + { + *size = 3; + return r; + } + return -1; + } + + int c3 = src[3] & 255; + + if ((c & 0xF8) == 0xF0) + { + r = ((c & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3; + if (r >= 65536 && r <= 1114111) + { + *size = 4; + return r; + } + } + return -1; +} + +static TArray out; +static const char *StringToUnicode(const char *cc, int size = -1) +{ + int ch; + const char *c = cc; + int count = 0; + int count1 = 0; + out.Clear(); + while (ch = (*c++) & 255) + { + count1++; + if (ch >= 128) + { + if (ch < 0x800) count += 2; + else count += 3; + // The source cannot contain 4-byte chars. + } + else count++; + if (count1 == size && size > 0) break; + } + if (count == count1) return cc; // string is pure ASCII. + // we need to convert + out.Resize(count + 1); + out.Last() = 0; + c = cc; + int i = 0; + while (ch = (*c++) & 255) + { + utf8_encode(ch, &out[i], &count1); + i += count1; + } + return &out[0]; +} + +static const char *UnicodeToString(const char *cc) +{ + out.Resize((unsigned)strlen(cc) + 1); + int ndx = 0; + while (*cc != 0) + { + int size; + int c = utf8_decode(cc, &size); + if (c < 0 || c > 255) c = '?'; + out[ndx++] = c; + cc += size; + } + out[ndx] = 0; + return &out[0]; +} + //========================================================================== // // @@ -99,8 +236,8 @@ struct FJSONObject struct FWriter { - typedef rapidjson::Writer > Writer; - typedef rapidjson::PrettyWriter > PrettyWriter; + typedef rapidjson::Writer > Writer; + typedef rapidjson::PrettyWriter > PrettyWriter; Writer *mWriter1; PrettyWriter *mWriter2; @@ -173,14 +310,16 @@ struct FWriter void String(const char *k) { + k = StringToUnicode(k); if (mWriter1) mWriter1->String(k); else if (mWriter2) mWriter2->String(k); } void String(const char *k, int size) { - if (mWriter1) mWriter1->String(k, size); - else if (mWriter2) mWriter2->String(k, size); + k = StringToUnicode(k, size); + if (mWriter1) mWriter1->String(k); + else if (mWriter2) mWriter2->String(k); } void Bool(bool k) @@ -602,7 +741,7 @@ FSerializer &FSerializer::Args(const char *key, int *args, int *defargs, int spe } else if (i == 0 && aval.IsString()) { - args[i] = -FName(aval.GetString()); + args[i] = -FName(UnicodeToString(aval.GetString())); } else { @@ -654,7 +793,7 @@ FSerializer &FSerializer::ScriptNum(const char *key, int &num) } else if (val->IsString()) { - num = -FName(val->GetString()); + num = -FName(UnicodeToString(val->GetString())); } else { @@ -709,7 +848,7 @@ FSerializer &FSerializer::Sprite(const char *key, int32_t &spritenum, int32_t *d { if (val->IsString()) { - uint32_t name = *reinterpret_cast(val->GetString()); + uint32_t name = *reinterpret_cast(UnicodeToString(val->GetString())); for (auto hint = NumStdSprites; hint-- != 0; ) { if (sprites[hint].dwName == name) @@ -747,7 +886,7 @@ FSerializer &FSerializer::StringPtr(const char *key, const char *&charptr) { if (val->IsString()) { - charptr = val->GetString(); + charptr = UnicodeToString(val->GetString()); } else { @@ -1403,7 +1542,7 @@ FSerializer &Serialize(FSerializer &arc, const char *key, FTextureID &value, FTe assert(nameval.IsString() && typeval.IsInt()); if (nameval.IsString() && typeval.IsInt()) { - value = TexMan.GetTexture(nameval.GetString(), typeval.GetInt()); + value = TexMan.GetTexture(UnicodeToString(nameval.GetString()), typeval.GetInt()); } else { @@ -1553,7 +1692,7 @@ FSerializer &Serialize(FSerializer &arc, const char *key, FName &value, FName *d assert(val->IsString()); if (val->IsString()) { - value = val->GetString(); + value = UnicodeToString(val->GetString()); } else { @@ -1638,7 +1777,7 @@ FSerializer &Serialize(FSerializer &arc, const char *key, FSoundID &sid, FSoundI assert(val->IsString() || val->IsNull()); if (val->IsString()) { - sid = val->GetString(); + sid = UnicodeToString(val->GetString()); } else if (val->IsNull()) { @@ -1687,7 +1826,7 @@ template<> FSerializer &Serialize(FSerializer &arc, const char *key, PClassActor assert(val->IsString() || val->IsNull()); if (val->IsString()) { - clst = PClass::FindActor(val->GetString()); + clst = PClass::FindActor(UnicodeToString(val->GetString())); } else if (val->IsNull()) { @@ -1735,7 +1874,7 @@ template<> FSerializer &Serialize(FSerializer &arc, const char *key, PClass *&cl { if (val->IsString()) { - clst = PClass::FindClass(val->GetString()); + clst = PClass::FindClass(UnicodeToString(val->GetString())); } else if (val->IsNull()) { @@ -1810,7 +1949,7 @@ FSerializer &Serialize(FSerializer &arc, const char *key, FState *&state, FState assert(cls.IsString() && ndx.IsUint()); if (cls.IsString() && ndx.IsUint()) { - PClassActor *clas = PClass::FindActor(cls.GetString()); + PClassActor *clas = PClass::FindActor(UnicodeToString(cls.GetString())); if (clas && ndx.GetUint() < (unsigned)clas->NumOwnedStates) { state = clas->OwnedStates + ndx.GetUint(); @@ -1932,7 +2071,7 @@ template<> FSerializer &Serialize(FSerializer &arc, const char *key, FString *&p } else if (val->IsString()) { - pstr = AActor::mStringPropertyData.Alloc(val->GetString()); + pstr = AActor::mStringPropertyData.Alloc(UnicodeToString(val->GetString())); } else { @@ -1974,7 +2113,7 @@ FSerializer &Serialize(FSerializer &arc, const char *key, FString &pstr, FString } else if (val->IsString()) { - pstr = val->GetString(); + pstr = UnicodeToString(val->GetString()); } else { @@ -2023,7 +2162,7 @@ template<> FSerializer &Serialize(FSerializer &arc, const char *key, char *&pstr } else if (val->IsString()) { - pstr = copystring(val->GetString()); + pstr = copystring(UnicodeToString(val->GetString())); } else {