/* ** utf8.cpp ** UTF-8 utilities ** **--------------------------------------------------------------------------- ** Copyright 2019 Christoph Oelckers ** All rights reserved. ** ** Redistribution and use in source and binary forms, with or without ** modification, are permitted provided that the following conditions ** are met: ** ** 1. Redistributions of source code must retain the above copyright ** notice, this list of conditions and the following disclaimer. ** 2. Redistributions in binary form must reproduce the above copyright ** notice, this list of conditions and the following disclaimer in the ** documentation and/or other materials provided with the distribution. ** 3. The name of the author may not be used to endorse or promote products ** derived from this software without specific prior written permission. ** ** THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR ** IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES ** OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. ** IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, ** INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT ** NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ** DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ** THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF ** THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **--------------------------------------------------------------------------- ** */ #include #include "tarray.h" //========================================================================== // // // //========================================================================== int utf8_encode(int32_t codepoint, uint8_t *buffer, int *size) { if (codepoint < 0) return -1; else if (codepoint < 0x80) { buffer[0] = (char)codepoint; *size = 1; } else if (codepoint < 0x800) { buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6); buffer[1] = 0x80 + ((codepoint & 0x03F)); *size = 2; } else if (codepoint < 0x10000) { buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12); buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6); buffer[2] = 0x80 + ((codepoint & 0x003F)); *size = 3; } else if (codepoint <= 0x10FFFF) { buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18); buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12); buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6); buffer[3] = 0x80 + ((codepoint & 0x00003F)); *size = 4; } else return -1; return 0; } //========================================================================== // // // //========================================================================== int utf8_decode(const uint8_t *src, int *size) { int c = src[0]; int r; *size = 1; if ((c & 0x80) == 0) { return c; } int c1 = src[1]; if ((c & 0xE0) == 0xC0) { r = ((c & 0x1F) << 6) | c1; if (r >= 128) { *size = 2; return r; } return -1; } int c2 = src[2]; if ((c & 0xF0) == 0xE0) { r = ((c & 0x0F) << 12) | (c1 << 6) | c2; if (r >= 2048 && (r < 55296 || r > 57343)) { *size = 3; return r; } return -1; } int c3 = src[3]; if ((c & 0xF8) == 0xF0) { r = ((c & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3; if (r >= 65536 && r <= 1114111) { *size = 4; return r; } } return -1; } //========================================================================== // // Unicode mapping for the 0x80-0x9f range of the Windows 1252 code page // //========================================================================== uint16_t win1252map[] = { 0x20AC, 0x81 , 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8d , 0x017D, 0x8f , 0x90 , 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9d , 0x017E, 0x0178, }; //========================================================================== // // reads one character from the string. // This can handle both ISO 8859-1/Windows-1252 and UTF-8, as well as mixed strings // between both encodings, which may happen if inconsistent encoding is // used between different files in a mod. // //========================================================================== int GetCharFromString(const uint8_t *&string) { int z; z = *string; if (z < 192) { string++; // Handle Windows 1252 characters if (z >= 128 && z < 160) { return win1252map[z - 128]; } return z; } else { int size = 0; auto chr = utf8_decode(string, &size); if (chr >= 0) { string += size; return chr; } string++; return z; } } //========================================================================== // // convert a potentially mixed-encoded string to pure UTF-8 // this returns a pointer to a static buffer, // assuming that its caller will immediately process the result. // //========================================================================== static TArray UTF8String; const char *MakeUTF8(const char *outline, int *numchars = nullptr) { UTF8String.Clear(); const uint8_t *in = (const uint8_t*)outline; if (numchars) *numchars = 0; while (int chr = GetCharFromString(in)) { int size = 0; uint8_t encode[4]; if (!utf8_encode(chr, encode, &size)) { for (int i = 0; i < size; i++) { UTF8String.Push(encode[i]); } } if (numchars) *numchars++; } UTF8String.Push(0); return UTF8String.Data(); }