/* ** utf8.cpp ** UTF-8 utilities ** **--------------------------------------------------------------------------- ** Copyright 2019 Christoph Oelckers ** All rights reserved. ** ** Redistribution and use in source and binary forms, with or without ** modification, are permitted provided that the following conditions ** are met: ** ** 1. Redistributions of source code must retain the above copyright ** notice, this list of conditions and the following disclaimer. ** 2. Redistributions in binary form must reproduce the above copyright ** notice, this list of conditions and the following disclaimer in the ** documentation and/or other materials provided with the distribution. ** 3. The name of the author may not be used to endorse or promote products ** derived from this software without specific prior written permission. ** ** THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR ** IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES ** OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. ** IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, ** INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT ** NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ** DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ** THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF ** THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**--------------------------------------------------------------------------- ** */ #include #include "tarray.h" #include "utf8.h" //========================================================================== // // // //========================================================================== int utf8_encode(int32_t codepoint, uint8_t *buffer, int *size) { if (codepoint < 0) return -1; else if (codepoint < 0x80) { buffer[0] = (char)codepoint; *size = 1; } else if (codepoint < 0x800) { buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6); buffer[1] = 0x80 + ((codepoint & 0x03F)); *size = 2; } else if (codepoint < 0x10000) { buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12); buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6); buffer[2] = 0x80 + ((codepoint & 0x003F)); *size = 3; } else if (codepoint <= 0x10FFFF) { buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18); buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12); buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6); buffer[3] = 0x80 + ((codepoint & 0x00003F)); *size = 4; } else return -1; return 0; } //========================================================================== // // // //========================================================================== int utf8_decode(const uint8_t *src, int *size) { int c = src[0]; int r; *size = 1; if ((c & 0x80) == 0) { return c; } int c1 = src[1]; if (c1 < 0x80 || c1 >= 0xc0) return -1; c1 &= 0x3f; if ((c & 0xE0) == 0xC0) { r = ((c & 0x1F) << 6) | c1; if (r >= 128) { *size = 2; return r; } return -1; } int c2 = src[2]; if (c2 < 0x80 || c2 >= 0xc0) return -1; c2 &= 0x3f; if ((c & 0xF0) == 0xE0) { r = ((c & 0x0F) << 12) | (c1 << 6) | c2; if (r >= 2048 && (r < 55296 || r > 57343)) { *size = 3; return r; } return -1; } int c3 = src[3]; if (c3 < 0x80 || c1 >= 0xc0) return -1; c3 &= 0x3f; if ((c & 0xF8) == 0xF0) { r = ((c & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3; if (r >= 65536 && r <= 1114111) { *size = 4; return r; } } return -1; } 
//========================================================================== // // Unicode mapping for the 0x80-0x9f range of the Windows 1252 code page // //========================================================================== uint16_t win1252map[] = { 0x20AC, 0x81 , 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8d , 0x017D, 0x8f , 0x90 , 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9d , 0x017E, 0x0178, }; //========================================================================== // // reads one character from the string. // This can handle both ISO 8859-1/Windows-1252 and UTF-8, as well as mixed strings // between both encodings, which may happen if inconsistent encoding is // used between different files in a mod. // //========================================================================== int GetCharFromString(const uint8_t *&string) { int z; z = *string; if (z < 192) { string++; // Handle Windows 1252 characters if (z >= 128 && z < 160) { return win1252map[z - 128]; } return z; } else { int size = 0; auto chr = utf8_decode(string, &size); if (chr >= 0) { string += size; return chr; } string++; return z; } } //========================================================================== // // convert a potentially mixed-encoded string to pure UTF-8 // this returns a pointer to a static buffer, // assuming that its caller will immediately process the result. 
// //========================================================================== static TArray UTF8String; const char *MakeUTF8(const char *outline, int *numchars) { UTF8String.Clear(); const uint8_t *in = (const uint8_t*)outline; if (numchars) *numchars = 0; while (int chr = GetCharFromString(in)) { int size = 0; uint8_t encode[4]; if (!utf8_encode(chr, encode, &size)) { for (int i = 0; i < size; i++) { UTF8String.Push(encode[i]); } } if (numchars) (*numchars)++; } UTF8String.Push(0); return UTF8String.Data(); } const char *MakeUTF8(int codepoint, int *psize) { int size = 0; UTF8String.Resize(5); utf8_encode(codepoint, (uint8_t*)UTF8String.Data(), &size); UTF8String[size] = 0; if (psize) *psize = size; return UTF8String.Data(); } //========================================================================== // // Returns a character without an accent mark (or one with a similar looking accent in some cases where direct support is unlikely.) // //========================================================================== int stripaccent(int code) { if (code < 0x8a) return code; if (code < 0x100) { if (code == 0x8a) // Latin capital letter S with caron return 'S'; if (code == 0x8e) // Latin capital letter Z with caron return 'Z'; if (code == 0x9a) // Latin small letter S with caron return 's'; if (code == 0x9e) // Latin small letter Z with caron return 'z'; if (code == 0x9f) // Latin capital letter Y with diaeresis return 'Y'; if (code == 0xab || code == 0xbb) return '"'; // typographic quotation marks. if (code == 0xff) // Latin small letter Y with diaeresis return 'y'; // Every other accented character has the high two bits set. if ((code & 0xC0) == 0) return code; // Make lowercase characters uppercase so there are half as many tests. 
int acode = code & 0xDF; if (acode >= 0xC0 && acode <= 0xC5) // A with accents return 'A' + (code & 0x20); if (acode == 0xC7) // Cedilla return 'C' + (acode & 0x20); if (acode >= 0xC8 && acode <= 0xCB) // E with accents return 'E' + (code & 0x20); if (acode >= 0xCC && acode <= 0xCF) // I with accents return 'I' + (code & 0x20); if (acode == 0xD0) // Eth return 'D' + (code & 0x20); if (acode == 0xD1) // N with tilde return 'N' + (code & 0x20); if ((acode >= 0xD2 && acode <= 0xD6) || // O with accents acode == 0xD8) // O with stroke return 'O' + (code & 0x20); if (acode >= 0xD9 && acode <= 0xDC) // U with accents return 'U' + (code & 0x20); if (acode == 0xDD) // Y with accute return 'Y' + (code & 0x20); if (acode == 0xDE) // Thorn return 'P' + (code & 0x20); // well, it sort of looks like a 'P' } else if (code >= 0x100 && code < 0x180) { // For the double-accented Hungarian letters it makes more sense to first map them to the very similar looking Umlauts. // (And screw the crappy specs that do not allow UTF-8 multibyte character literals here.) if (code == 0x150) code = 0xd6; else if (code == 0x151) code = 0xf6; else if (code == 0x170) code = 0xdc; else if (code == 0x171) code = 0xfc; else { static const char accentless[] = "AaAaAaCcCcCcCcDdDdEeEeEeEeEeGgGgGgGgHhHhIiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnnNnOoOoOoOoRrRrRrSsSsSsSsTtTtTtUuUuUuUuUuUuWwYyYZzZzZzs"; return accentless[code - 0x100]; } } else if (code >= 0x1fc && code < 0x218) { // 0x200-0x217 are irrelevant but easy to map to other characters more likely to exist. static const uint16_t u200map[] = {0xc6, 0xe6, 0xd8, 0xf8, 0xc4, 0xe4, 0xc2, 0xe2, 0xcb, 0xeb, 0xca, 0xea, 0xcf, 0xef, 0xce, 0xee, 0xd6, 0xf6, 0xd4, 0xe4, 'R', 'r', 'R', 'r', 0xdc, 0xfc, 0xdb, 0xfb}; return u200map[code - 0x1fc]; } return getAlternative(code); } //========================================================================== // // Return replacement characters that should not make font completeness tests fail. 
//
//==========================================================================

// Maps a code point to a similar looking replacement. Any code point not
// listed below is returned unchanged.
int getAlternative(int code)
{
	switch (code)
	{
	// ASCII
	case '{':	return '(';
	case '}':	return ')';

	// The 'long s' can be safely remapped to the regular variant, not that this gets used in any real text...
	case 0x17f:	return 's';

	// Romanian S and T with comma below may get remapped to the variants with cedilla.
	case 0x218:	return 0x15e;
	case 0x219:	return 0x15f;
	case 0x21a:	return 0x162;
	case 0x21b:	return 0x163;

	// Greek characters with accents must map to their base form due to the "no accents in allcaps " rule.
	case 0x386:	return 0x391;
	case 0x388:	return 0x395;
	case 0x389:	return 0x397;
	case 0x38a:	return 0x399;
	case 0x38c:	return 0x39f;
	case 0x38e:	return 0x3a5;
	case 0x38f:	return 0x3a9;

	// For smallcaps fonts the small accented Greek characters remap to the unaccented versions.
	case 0x390:	return 0x3ca;

	// Greek characters with equivalents in either Latin or Cyrillic. This is only suitable for uppercase fonts!
	case 0x391:	return 'A';
	case 0x392:	return 'B';
	case 0x393:	return 0x413;
	case 0x395:	return 'E';
	case 0x396:	return 'Z';
	case 0x397:	return 'H';
	case 0x399:	return 'I';
	case 0x39a:	return 'K';
	case 0x39c:	return 'M';
	case 0x39d:	return 'N';
	case 0x39f:	return 'O';
	case 0x3a0:	return 0x41f;
	case 0x3a1:	return 'P';
	case 0x3a4:	return 'T';
	case 0x3a5:	return 'Y';
	case 0x3a6:	return 0x424;
	case 0x3a7:	return 'X';
	case 0x3aa:	return 0xcf;
	case 0x3ab:	return 0x178;

	// More small accented Greek characters that remap to the unaccented versions.
	case 0x3ac:	return 0x3b1;
	case 0x3ad:	return 0x3b5;
	case 0x3ae:	return 0x3b7;
	case 0x3af:	return 0x3b9;
	case 0x3b0:	return 0x3cb;

	// the Omicron is the only small Greek character that's easily mappable to a Latin equivalent. :(
	case 0x3bf:	return 'o';

	// Lowercase Sigma character in Greek, which changes depending on its positioning in a word;
	// if the font is uppercase only or features a smallcaps style, the second variant of the letter will remain unused
	case 0x3c2:	return 0x3c3;

	case 0x3cc:	return 0x3bf;
	case 0x3cd:	return 0x3c5;
	case 0x3ce:	return 0x3c9;

	// Cyrillic characters with equivalents in the Latin alphabet.
	case 0x400:	return 0xc8;
	case 0x401:	return 0xcb;
	case 0x405:	return 'S';
	case 0x406:	return 'I';
	case 0x407:	return 0xcf;
	case 0x408:	return 'J';
	case 0x410:	return 'A';
	case 0x412:	return 'B';
	case 0x415:	return 'E';
	case 0x41a:	return 'K';
	case 0x41c:	return 'M';
	case 0x41d:	return 'H';
	case 0x41e:	return 'O';
	case 0x420:	return 'P';
	case 0x421:	return 'C';
	case 0x422:	return 'T';
	case 0x425:	return 'X';
	case 0x430:	return 'a';
	case 0x435:	return 'e';
	case 0x43e:	return 'o';
	case 0x440:	return 'p';
	case 0x441:	return 'c';
	case 0x445:	return 'x';
	case 0x450:	return 0xe8;
	case 0x451:	return 0xeb;
	case 0x455:	return 's';
	case 0x456:	return 'i';
	case 0x457:	return 0xef;
	case 0x458:	return 'j';

	default:
		return code;
	}
}

//==========================================================================
//
// Unicode-aware upper/lowercase conversion
// The only characters not being handled by this are the Turkish I's
// because those are language specific.
//
//==========================================================================

// Lookup tables with one entry for every 16 bit code point.
uint16_t lowerforupper[65536];
uint16_t upperforlower[65536];
bool islowermap[65536];
bool isuppermap[65536];

// This is a supposedly complete mapping of all lower <-> upper pairs.
Most will most likely never be needed by Doom but this way there won't be any future surprises static const uint16_t loweruppercase[] = { 0x0061,0x0041, 0x0062,0x0042, 0x0063,0x0043, 0x0064,0x0044, 0x0065,0x0045, 0x0066,0x0046, 0x0067,0x0047, 0x0068,0x0048, 0x0069,0x0049, 0x006A,0x004A, 0x006B,0x004B, 0x006C,0x004C, 0x006D,0x004D, 0x006E,0x004E, 0x006F,0x004F, 0x0070,0x0050, 0x0071,0x0051, 0x0072,0x0052, 0x0073,0x0053, 0x0074,0x0054, 0x0075,0x0055, 0x0076,0x0056, 0x0077,0x0057, 0x0078,0x0058, 0x0079,0x0059, 0x007A,0x005A, 0x00DF,0x1E9E, 0x00E0,0x00C0, 0x00E1,0x00C1, 0x00E2,0x00C2, 0x00E3,0x00C3, 0x00E4,0x00C4, 0x00E5,0x00C5, 0x00E6,0x00C6, 0x00E7,0x00C7, 0x00E8,0x00C8, 0x00E9,0x00C9, 0x00EA,0x00CA, 0x00EB,0x00CB, 0x00EC,0x00CC, 0x00ED,0x00CD, 0x00EE,0x00CE, 0x00EF,0x00CF, 0x00F0,0x00D0, 0x00F1,0x00D1, 0x00F2,0x00D2, 0x00F3,0x00D3, 0x00F4,0x00D4, 0x00F5,0x00D5, 0x00F6,0x00D6, 0x00F8,0x00D8, 0x00F9,0x00D9, 0x00FA,0x00DA, 0x00FB,0x00DB, 0x00FC,0x00DC, 0x00FD,0x00DD, 0x00FE,0x00DE, 0x00FF,0x0178, 0x0101,0x0100, 0x0103,0x0102, 0x0105,0x0104, 0x0107,0x0106, 0x0109,0x0108, 0x010B,0x010A, 0x010D,0x010C, 0x010F,0x010E, 0x0111,0x0110, 0x0113,0x0112, 0x0115,0x0114, 0x0117,0x0116, 0x0119,0x0118, 0x011B,0x011A, 0x011D,0x011C, 0x011F,0x011E, 0x0121,0x0120, 0x0123,0x0122, 0x0125,0x0124, 0x0127,0x0126, 0x0129,0x0128, 0x012B,0x012A, 0x012D,0x012C, 0x012F,0x012E, 0x0133,0x0132, 0x0135,0x0134, 0x0137,0x0136, 0x013A,0x0139, 0x013C,0x013B, 0x013E,0x013D, 0x0140,0x013F, 0x0142,0x0141, 0x0144,0x0143, 0x0146,0x0145, 0x0148,0x0147, 0x014B,0x014A, 0x014D,0x014C, 0x014F,0x014E, 0x0151,0x0150, 0x0153,0x0152, 0x0155,0x0154, 0x0157,0x0156, 0x0159,0x0158, 0x015B,0x015A, 0x015D,0x015C, 0x015F,0x015E, 0x0161,0x0160, 0x0163,0x0162, 0x0165,0x0164, 0x0167,0x0166, 0x0169,0x0168, 0x016B,0x016A, 0x016D,0x016C, 0x016F,0x016E, 0x0171,0x0170, 0x0173,0x0172, 0x0175,0x0174, 0x0177,0x0176, 0x017A,0x0179, 0x017C,0x017B, 0x017E,0x017D, 0x0183,0x0182, 0x0185,0x0184, 0x0188,0x0187, 0x018C,0x018B, 0x0192,0x0191, 
0x0199,0x0198, 0x01A1,0x01A0, 0x01A3,0x01A2, 0x01A5,0x01A4, 0x01A8,0x01A7, 0x01AD,0x01AC, 0x01B0,0x01AF, 0x01B4,0x01B3, 0x01B6,0x01B5, 0x01B9,0x01B8, 0x01BD,0x01BC, 0x01C6,0x01C4, 0x01C9,0x01C7, 0x01CC,0x01CA, 0x01CE,0x01CD, 0x01D0,0x01CF, 0x01D2,0x01D1, 0x01D4,0x01D3, 0x01D6,0x01D5, 0x01D8,0x01D7, 0x01DA,0x01D9, 0x01DC,0x01DB, 0x01DF,0x01DE, 0x01E1,0x01E0, 0x01E3,0x01E2, 0x01E5,0x01E4, 0x01E7,0x01E6, 0x01E9,0x01E8, 0x01EB,0x01EA, 0x01ED,0x01EC, 0x01EF,0x01EE, 0x01F3,0x01F1, 0x01F5,0x01F4, 0x01FB,0x01FA, 0x01FD,0x01FC, 0x01FF,0x01FE, 0x0201,0x0200, 0x0203,0x0202, 0x0205,0x0204, 0x0207,0x0206, 0x0209,0x0208, 0x020B,0x020A, 0x020D,0x020C, 0x020F,0x020E, 0x0211,0x0210, 0x0213,0x0212, 0x0215,0x0214, 0x0217,0x0216, 0x0253,0x0181, 0x0254,0x0186, 0x0257,0x018A, 0x0258,0x018E, 0x0259,0x018F, 0x025B,0x0190, 0x0260,0x0193, 0x0263,0x0194, 0x0268,0x0197, 0x0269,0x0196, 0x026F,0x019C, 0x0272,0x019D, 0x0275,0x019F, 0x0283,0x01A9, 0x0288,0x01AE, 0x028A,0x01B1, 0x028B,0x01B2, 0x0292,0x01B7, 0x03AC,0x0386, 0x03AD,0x0388, 0x03AE,0x0389, 0x03AF,0x038A, 0x03B1,0x0391, 0x03B2,0x0392, 0x03B3,0x0393, 0x03B4,0x0394, 0x03B5,0x0395, 0x03B6,0x0396, 0x03B7,0x0397, 0x03B8,0x0398, 0x03B9,0x0399, 0x03BA,0x039A, 0x03BB,0x039B, 0x03BC,0x039C, 0x03BD,0x039D, 0x03BE,0x039E, 0x03BF,0x039F, 0x03C0,0x03A0, 0x03C1,0x03A1, 0x03C3,0x03A3, 0x03C4,0x03A4, 0x03C5,0x03A5, 0x03C6,0x03A6, 0x03C7,0x03A7, 0x03C8,0x03A8, 0x03C9,0x03A9, 0x03CA,0x03AA, 0x03CB,0x03AB, 0x03CC,0x038C, 0x03CD,0x038E, 0x03CE,0x038F, 0x03E3,0x03E2, 0x03E5,0x03E4, 0x03E7,0x03E6, 0x03E9,0x03E8, 0x03EB,0x03EA, 0x03ED,0x03EC, 0x03EF,0x03EE, 0x0430,0x0410, 0x0431,0x0411, 0x0432,0x0412, 0x0433,0x0413, 0x0434,0x0414, 0x0435,0x0415, 0x0436,0x0416, 0x0437,0x0417, 0x0438,0x0418, 0x0439,0x0419, 0x043A,0x041A, 0x043B,0x041B, 0x043C,0x041C, 0x043D,0x041D, 0x043E,0x041E, 0x043F,0x041F, 0x0440,0x0420, 0x0441,0x0421, 0x0442,0x0422, 0x0443,0x0423, 0x0444,0x0424, 0x0445,0x0425, 0x0446,0x0426, 0x0447,0x0427, 0x0448,0x0428, 0x0449,0x0429, 0x044A,0x042A, 
0x044B,0x042B, 0x044C,0x042C, 0x044D,0x042D, 0x044E,0x042E, 0x044F,0x042F, 0x0451,0x0401, 0x0452,0x0402, 0x0453,0x0403, 0x0454,0x0404, 0x0455,0x0405, 0x0456,0x0406, 0x0457,0x0407, 0x0458,0x0408, 0x0459,0x0409, 0x045A,0x040A, 0x045B,0x040B, 0x045C,0x040C, 0x045E,0x040E, 0x045F,0x040F, 0x0461,0x0460, 0x0463,0x0462, 0x0465,0x0464, 0x0467,0x0466, 0x0469,0x0468, 0x046B,0x046A, 0x046D,0x046C, 0x046F,0x046E, 0x0471,0x0470, 0x0473,0x0472, 0x0475,0x0474, 0x0477,0x0476, 0x0479,0x0478, 0x047B,0x047A, 0x047D,0x047C, 0x047F,0x047E, 0x0481,0x0480, 0x0491,0x0490, 0x0493,0x0492, 0x0495,0x0494, 0x0497,0x0496, 0x0499,0x0498, 0x049B,0x049A, 0x049D,0x049C, 0x049F,0x049E, 0x04A1,0x04A0, 0x04A3,0x04A2, 0x04A5,0x04A4, 0x04A7,0x04A6, 0x04A9,0x04A8, 0x04AB,0x04AA, 0x04AD,0x04AC, 0x04AF,0x04AE, 0x04B1,0x04B0, 0x04B3,0x04B2, 0x04B5,0x04B4, 0x04B7,0x04B6, 0x04B9,0x04B8, 0x04BB,0x04BA, 0x04BD,0x04BC, 0x04BF,0x04BE, 0x04C2,0x04C1, 0x04C4,0x04C3, 0x04C8,0x04C7, 0x04CC,0x04CB, 0x04D1,0x04D0, 0x04D3,0x04D2, 0x04D5,0x04D4, 0x04D7,0x04D6, 0x04D9,0x04D8, 0x04DB,0x04DA, 0x04DD,0x04DC, 0x04DF,0x04DE, 0x04E1,0x04E0, 0x04E3,0x04E2, 0x04E5,0x04E4, 0x04E7,0x04E6, 0x04E9,0x04E8, 0x04EB,0x04EA, 0x04EF,0x04EE, 0x04F1,0x04F0, 0x04F3,0x04F2, 0x04F5,0x04F4, 0x04F9,0x04F8, 0x0561,0x0531, 0x0562,0x0532, 0x0563,0x0533, 0x0564,0x0534, 0x0565,0x0535, 0x0566,0x0536, 0x0567,0x0537, 0x0568,0x0538, 0x0569,0x0539, 0x056A,0x053A, 0x056B,0x053B, 0x056C,0x053C, 0x056D,0x053D, 0x056E,0x053E, 0x056F,0x053F, 0x0570,0x0540, 0x0571,0x0541, 0x0572,0x0542, 0x0573,0x0543, 0x0574,0x0544, 0x0575,0x0545, 0x0576,0x0546, 0x0577,0x0547, 0x0578,0x0548, 0x0579,0x0549, 0x057A,0x054A, 0x057B,0x054B, 0x057C,0x054C, 0x057D,0x054D, 0x057E,0x054E, 0x057F,0x054F, 0x0580,0x0550, 0x0581,0x0551, 0x0582,0x0552, 0x0583,0x0553, 0x0584,0x0554, 0x0585,0x0555, 0x0586,0x0556, 0x10D0,0x10A0, 0x10D1,0x10A1, 0x10D2,0x10A2, 0x10D3,0x10A3, 0x10D4,0x10A4, 0x10D5,0x10A5, 0x10D6,0x10A6, 0x10D7,0x10A7, 0x10D8,0x10A8, 0x10D9,0x10A9, 0x10DA,0x10AA, 0x10DB,0x10AB, 
0x10DC,0x10AC, 0x10DD,0x10AD, 0x10DE,0x10AE, 0x10DF,0x10AF, 0x10E0,0x10B0, 0x10E1,0x10B1, 0x10E2,0x10B2, 0x10E3,0x10B3, 0x10E4,0x10B4, 0x10E5,0x10B5, 0x10E6,0x10B6, 0x10E7,0x10B7, 0x10E8,0x10B8, 0x10E9,0x10B9, 0x10EA,0x10BA, 0x10EB,0x10BB, 0x10EC,0x10BC, 0x10ED,0x10BD, 0x10EE,0x10BE, 0x10EF,0x10BF, 0x10F0,0x10C0, 0x10F1,0x10C1, 0x10F2,0x10C2, 0x10F3,0x10C3, 0x10F4,0x10C4, 0x10F5,0x10C5, 0x1E01,0x1E00, 0x1E03,0x1E02, 0x1E05,0x1E04, 0x1E07,0x1E06, 0x1E09,0x1E08, 0x1E0B,0x1E0A, 0x1E0D,0x1E0C, 0x1E0F,0x1E0E, 0x1E11,0x1E10, 0x1E13,0x1E12, 0x1E15,0x1E14, 0x1E17,0x1E16, 0x1E19,0x1E18, 0x1E1B,0x1E1A, 0x1E1D,0x1E1C, 0x1E1F,0x1E1E, 0x1E21,0x1E20, 0x1E23,0x1E22, 0x1E25,0x1E24, 0x1E27,0x1E26, 0x1E29,0x1E28, 0x1E2B,0x1E2A, 0x1E2D,0x1E2C, 0x1E2F,0x1E2E, 0x1E31,0x1E30, 0x1E33,0x1E32, 0x1E35,0x1E34, 0x1E37,0x1E36, 0x1E39,0x1E38, 0x1E3B,0x1E3A, 0x1E3D,0x1E3C, 0x1E3F,0x1E3E, 0x1E41,0x1E40, 0x1E43,0x1E42, 0x1E45,0x1E44, 0x1E47,0x1E46, 0x1E49,0x1E48, 0x1E4B,0x1E4A, 0x1E4D,0x1E4C, 0x1E4F,0x1E4E, 0x1E51,0x1E50, 0x1E53,0x1E52, 0x1E55,0x1E54, 0x1E57,0x1E56, 0x1E59,0x1E58, 0x1E5B,0x1E5A, 0x1E5D,0x1E5C, 0x1E5F,0x1E5E, 0x1E61,0x1E60, 0x1E63,0x1E62, 0x1E65,0x1E64, 0x1E67,0x1E66, 0x1E69,0x1E68, 0x1E6B,0x1E6A, 0x1E6D,0x1E6C, 0x1E6F,0x1E6E, 0x1E71,0x1E70, 0x1E73,0x1E72, 0x1E75,0x1E74, 0x1E77,0x1E76, 0x1E79,0x1E78, 0x1E7B,0x1E7A, 0x1E7D,0x1E7C, 0x1E7F,0x1E7E, 0x1E81,0x1E80, 0x1E83,0x1E82, 0x1E85,0x1E84, 0x1E87,0x1E86, 0x1E89,0x1E88, 0x1E8B,0x1E8A, 0x1E8D,0x1E8C, 0x1E8F,0x1E8E, 0x1E91,0x1E90, 0x1E93,0x1E92, 0x1E95,0x1E94, 0x1EA1,0x1EA0, 0x1EA3,0x1EA2, 0x1EA5,0x1EA4, 0x1EA7,0x1EA6, 0x1EA9,0x1EA8, 0x1EAB,0x1EAA, 0x1EAD,0x1EAC, 0x1EAF,0x1EAE, 0x1EB1,0x1EB0, 0x1EB3,0x1EB2, 0x1EB5,0x1EB4, 0x1EB7,0x1EB6, 0x1EB9,0x1EB8, 0x1EBB,0x1EBA, 0x1EBD,0x1EBC, 0x1EBF,0x1EBE, 0x1EC1,0x1EC0, 0x1EC3,0x1EC2, 0x1EC5,0x1EC4, 0x1EC7,0x1EC6, 0x1EC9,0x1EC8, 0x1ECB,0x1ECA, 0x1ECD,0x1ECC, 0x1ECF,0x1ECE, 0x1ED1,0x1ED0, 0x1ED3,0x1ED2, 0x1ED5,0x1ED4, 0x1ED7,0x1ED6, 0x1ED9,0x1ED8, 0x1EDB,0x1EDA, 0x1EDD,0x1EDC, 0x1EDF,0x1EDE, 
0x1EE1,0x1EE0, 0x1EE3,0x1EE2, 0x1EE5,0x1EE4, 0x1EE7,0x1EE6, 0x1EE9,0x1EE8, 0x1EEB,0x1EEA, 0x1EED,0x1EEC, 0x1EEF,0x1EEE, 0x1EF1,0x1EF0, 0x1EF3,0x1EF2, 0x1EF5,0x1EF4, 0x1EF7,0x1EF6, 0x1EF9,0x1EF8, 0x1F00,0x1F08, 0x1F01,0x1F09, 0x1F02,0x1F0A, 0x1F03,0x1F0B, 0x1F04,0x1F0C, 0x1F05,0x1F0D, 0x1F06,0x1F0E, 0x1F07,0x1F0F, 0x1F10,0x1F18, 0x1F11,0x1F19, 0x1F12,0x1F1A, 0x1F13,0x1F1B, 0x1F14,0x1F1C, 0x1F15,0x1F1D, 0x1F20,0x1F28, 0x1F21,0x1F29, 0x1F22,0x1F2A, 0x1F23,0x1F2B, 0x1F24,0x1F2C, 0x1F25,0x1F2D, 0x1F26,0x1F2E, 0x1F27,0x1F2F, 0x1F30,0x1F38, 0x1F31,0x1F39, 0x1F32,0x1F3A, 0x1F33,0x1F3B, 0x1F34,0x1F3C, 0x1F35,0x1F3D, 0x1F36,0x1F3E, 0x1F37,0x1F3F, 0x1F40,0x1F48, 0x1F41,0x1F49, 0x1F42,0x1F4A, 0x1F43,0x1F4B, 0x1F44,0x1F4C, 0x1F45,0x1F4D, 0x1F51,0x1F59, 0x1F53,0x1F5B, 0x1F55,0x1F5D, 0x1F57,0x1F5F, 0x1F60,0x1F68, 0x1F61, 0x1F69, 0x1F62, 0x1F6A, 0x1F63, 0x1F6B, 0x1F64, 0x1F6C, 0x1F65, 0x1F6D, 0x1F66, 0x1F6E, 0x1F67, 0x1F6F, 0x1F80, 0x1F88, 0x1F81, 0x1F89, 0x1F82, 0x1F8A, 0x1F83, 0x1F8B, 0x1F84, 0x1F8C, 0x1F85, 0x1F8D, 0x1F86, 0x1F8E, 0x1F87, 0x1F8F, 0x1F90, 0x1F98, 0x1F91, 0x1F99, 0x1F92, 0x1F9A, 0x1F93, 0x1F9B, 0x1F94, 0x1F9C, 0x1F95, 0x1F9D, 0x1F96, 0x1F9E, 0x1F97, 0x1F9F, 0x1FA0, 0x1FA8, 0x1FA1, 0x1FA9, 0x1FA2, 0x1FAA, 0x1FA3, 0x1FAB, 0x1FA4, 0x1FAC, 0x1FA5, 0x1FAD, 0x1FA6, 0x1FAE, 0x1FA7, 0x1FAF, 0x1FB0, 0x1FB8, 0x1FB1, 0x1FB9, 0x1FD0, 0x1FD8, 0x1FD1, 0x1FD9, 0x1FE0, 0x1FE8, 0x1FE1, 0x1FE9, 0x24D0, 0x24B6, 0x24D1, 0x24B7, 0x24D2, 0x24B8, 0x24D3, 0x24B9, 0x24D4, 0x24BA, 0x24D5, 0x24BB, 0x24D6, 0x24BC, 0x24D7, 0x24BD, 0x24D8, 0x24BE, 0x24D9, 0x24BF, 0x24DA, 0x24C0, 0x24DB, 0x24C1, 0x24DC, 0x24C2, 0x24DD, 0x24C3, 0x24DE, 0x24C4, 0x24DF, 0x24C5, 0x24E0, 0x24C6, 0x24E1, 0x24C7, 0x24E2, 0x24C8, 0x24E3, 0x24C9, 0x24E4, 0x24CA, 0x24E5, 0x24CB, 0x24E6, 0x24CC, 0x24E7, 0x24CD, 0x24E8, 0x24CE, 0x24E9, 0x24CF, 0xFF41, 0xFF21, 0xFF42, 0xFF22, 0xFF43, 0xFF23, 0xFF44, 0xFF24, 0xFF45, 0xFF25, 0xFF46, 0xFF26, 0xFF47, 0xFF27, 0xFF48, 0xFF28, 0xFF49, 0xFF29, 0xFF4A, 0xFF2A, 0xFF4B, 0xFF2B, 
0xFF4C, 0xFF2C, 0xFF4D, 0xFF2D, 0xFF4E, 0xFF2E, 0xFF4F, 0xFF2F, 0xFF50, 0xFF30, 0xFF51, 0xFF31, 0xFF52, 0xFF32, 0xFF53, 0xFF33, 0xFF54, 0xFF34, 0xFF55, 0xFF35, 0xFF56, 0xFF36, 0xFF57, 0xFF37, 0xFF58, 0xFF38, 0xFF59, 0xFF39, 0xFF5A, 0xFF3A, 0, 0 }; struct InitLowerUpper { InitLowerUpper() { for (int i = 0; i < 65536; i++) { lowerforupper[i] = i; upperforlower[i] = i; } for (int i = 0; loweruppercase[i]; i += 2) { auto lower = loweruppercase[i]; auto upper = loweruppercase[i + 1]; if (lowerforupper[upper] == upper) lowerforupper[upper] = lower; // This mapping is ambiguous so only pick the first match. if (upperforlower[lower] == lower) upperforlower[lower] = upper; isuppermap[upper] = islowermap[lower] = true; } // Special treatment for the two variants of the small sigma in Greek. islowermap[0x3c2] = true; upperforlower[0x3c2] = 0x3a3; // Turkish 'I's. upperforlower[0x131] = 'I'; lowerforupper[0x130] = 'i'; islowermap[0x131] = true; isuppermap[0x130] = true; } }; static InitLowerUpper initer; bool myislower(int code) { if (code >= 0 && code < 65536) return islowermap[code]; return false; } bool myisupper(int code) { if (code >= 0 && code < 65536) return isuppermap[code]; return false; } std::wstring WideString(const char* cin) { std::wstring buildbuffer; if (cin) { // This is a bit tricky because we need to support both UTF-8 and legacy content in ISO-8859-1 / Windows 1252 // and thanks to user-side string manipulation it can be that a text mixes both. // To convert the string this uses the same function as all text printing in the engine. const uint8_t* in = (const uint8_t*)cin; while (*in) buildbuffer.push_back((wchar_t)GetCharFromString(in)); } return buildbuffer; }