From 6dc36c6175eb67b47a1de29ff08a9f64c6d0fcf0 Mon Sep 17 00:00:00 2001
From: Daniel Gibson
Date: Sun, 2 Jun 2024 13:14:30 +0200
Subject: [PATCH] Add some functions to handle UTF-8 strings

- convert to/from ISO8859-1 (Doom3's "High ASCII" encoding)
- count Unicode codepoints in UTF-8 string
- cut UTF-8 string off after N codepoints
- use the conversion function to replace iconv in sys/events.cpp
---
 neo/idlib/Str.cpp  | 173 +++++++++++++++++++++++++++++++++++++++++++++
 neo/idlib/Str.h    |  21 ++++++
 neo/sys/events.cpp |  54 +--------------
 3 files changed, 197 insertions(+), 51 deletions(-)

diff --git a/neo/idlib/Str.cpp b/neo/idlib/Str.cpp
index 6595c76c..6bd89617 100644
--- a/neo/idlib/Str.cpp
+++ b/neo/idlib/Str.cpp
@@ -1880,3 +1880,176 @@ int D3_snprintfC99(char *dst, size_t size, const char *format, ...)
 	va_end( argptr );
 	return ret;
 }
+
+
+// convert UTF-8 to ISO8859-1 (the "High ASCII" 8-bit encoding Doom3 uses)
+// utf8str: \0-terminated input; isobuf/isobufLen: output buffer and its size in bytes
+// (including the terminating \0, which is always written on success)
+// invalidChar is inserted into the output buffer for unicode characters that can't be
+// represented by ISO8859-1; if it's 0, those will just be skipped
+// returns isobuf on success and NULL on error (NULL argument, isobufLen < 1,
+// output doesn't fit into isobuf or invalid encoding in utf8str)
+// based on stb_from_utf8 from https://github.com/nothings/stb/blob/master/deprecated/stb.h#L1010
+char * D3_UTF8toISO8859_1( const char *utf8str, char *isobuf, int isobufLen, char invalidChar )
+{
+	const unsigned char *str = (const unsigned char *) utf8str;
+	unsigned char* buffer = (unsigned char *)isobuf;
+	unsigned c;
+	int i=0;
+	int n = isobufLen - 1; // last byte is reserved for the terminating \0
+	if (str == NULL || buffer == NULL || isobufLen < 1)
+		return NULL;
+	while (*str) {
+		// NOTE: checked before decoding, so this also triggers if the next
+		//       character would be skipped anyway
+		if (i >= n)
+			return NULL;
+		if (!(*str & 0x80)) // ASCII char => copy directly
+			buffer[i++] = *str++;
+		else if ((*str & 0xe0) == 0xc0) {
+			// Unicode character between 0x0080 and 0x07FF
+			// => might be representable by ISO8859-1
+			if (*str < 0xc2) return NULL; // overlong encoding
+			c = (*str++ & 0x1f) << 6;
+			if ((*str & 0xc0) != 0x80) return NULL;
+			c += (*str++ & 0x3f);
+			if ( c <= 0xFF ) // ISO8859-1 covers U+0000 to U+00FF (including U+00FF)
+				buffer[i++] = c;
+			else if ( invalidChar != 0 )
+				buffer[i++] = invalidChar;
+		} else if ((*str & 0xf0) == 0xe0) {
+			// Unicode character between 0x0800 and 0xFFFF => way out of range for ISO8859-1
+			// so just validate and skip the input bytes
+			if (*str == 0xe0 && (str[1] < 0xa0 || str[1] > 0xbf)) return NULL; // overlong encoding
+			if (*str == 0xed && str[1] > 0x9f) return NULL; // surrogate; str[1] < 0x80 is checked below
+			++str;
+			if ((*str & 0xc0) != 0x80) return NULL;
+			++str;
+			if ((*str & 0xc0) != 0x80) return NULL;
+			++str;
+			if ( invalidChar != 0 )
+				buffer[i++] = invalidChar;
+		} else if ((*str & 0xf8) == 0xf0) {
+			// Unicode character between 0x010000 and 0x10FFFF => even more out of range
+			// again, validate and skip
+			if (*str > 0xf4) return NULL;
+			if (*str == 0xf0 && (str[1] < 0x90 || str[1] > 0xbf)) return NULL; // overlong encoding
+			if (*str == 0xf4 && str[1] > 0x8f) return NULL; // > 0x10FFFF; str[1] < 0x80 is checked below
+			++str;
+			if ((*str & 0xc0) != 0x80) return NULL;
+			++str;
+			if ((*str & 0xc0) != 0x80) return NULL;
+			++str;
+			if ((*str & 0xc0) != 0x80) return NULL;
+			++str;
+			if ( invalidChar != 0 )
+				buffer[i++] = invalidChar;
+		} else
+			return NULL;
+	}
+	buffer[i] = '\0';
+	return isobuf;
+}
+
+// convert ISO8859-1 (the "High ASCII" 8-bit encoding Doom3 uses) to UTF-8
+// utf8bufLen is the size of utf8buf in bytes (including the terminating \0); every
+// input char >= 0x80 needs two bytes in the output
+// returns utf8buf on success and NULL on error (NULL argument, utf8bufLen < 1
+// or need more than utf8bufLen chars in utf8buf)
+// based on stb_to_utf8 from https://github.com/nothings/stb/blob/master/deprecated/stb.h#L1060
+char * D3_ISO8859_1toUTF8( const char* isoStr, char *utf8buf, int utf8bufLen )
+{
+	const unsigned char *str = (const unsigned char *)isoStr;
+	unsigned char *buffer = (unsigned char *)utf8buf;
+	int i=0;
+	int n = utf8bufLen - 1; // last byte is reserved for the terminating \0
+	if (str == NULL || buffer == NULL || utf8bufLen < 1)
+		return NULL;
+	while (*str) {
+		if (*str < 0x80) {
+			if (i >= n)
+				return NULL;
+			buffer[i++] = *str++;
+		} else {
+			// needs two bytes - checking for just one could write behind the buffer
+			if (i + 1 >= n)
+				return NULL;
+			buffer[i++] = 0xc0 + (*str >> 6);
+			buffer[i++] = 0x80 + (*str & 0x3f);
+			++str;
+		}
+	}
+	buffer[i] = '\0';
+	return utf8buf;
+}
+
+// returns the number of bytes the UTF-8 sequence starting with the byte "lead" claims to have
+// continuation bytes and invalid lead bytes are treated as 1-byte sequences, so callers
+// always make progress on malformed input
+static size_t D3_UTF8SequenceLength( unsigned char lead )
+{
+	if ( (lead & 0xf8) == 0xf0 ) return 4; // 4-byte utf8 code point (0b11110xxx)
+	if ( (lead & 0xf0) == 0xe0 ) return 3; // 3-byte utf8 code point (0b1110xxxx)
+	if ( (lead & 0xe0) == 0xc0 ) return 2; // 2-byte utf8 code point (0b110xxxxx)
+	return 1; // 1-byte ascii (0b0xxxxxxx) or malformed byte
+}
+
+// returns number of Unicode codepoints (UTF32 char) in given UTF-8 string.
+// if n is not (size_t)-1, it only looks at the first n bytes of str (but still stops at the first \0)
+// a multi-byte sequence that is cut off by n or by the terminating \0 is not counted
+// that's not necessarily the number of printed characters (as unicode allows graphemes that
+// consist of multiple codepoints), but for our purposes (limiting to Latin1 subset) it is..
+// based on utf8nlen from https://github.com/sheredom/utf8.h/blob/master/utf8.h
+size_t D3_UTF8CountCodepoints( const char *str, size_t n )
+{
+	const unsigned char *s = (const unsigned char *)str;
+	size_t pos = 0;
+	size_t length = 0;
+	if ( s == NULL )
+		return 0;
+
+	while ( pos < n && s[pos] != '\0' ) {
+		size_t seqLen = D3_UTF8SequenceLength( s[pos] );
+		// make sure the whole sequence lies within the string and the first n bytes,
+		// so we never read behind the terminating \0 of a truncated sequence
+		for ( size_t k = 1; k < seqLen; ++k ) {
+			if ( k >= n - pos || s[pos + k] == '\0' )
+				return length;
+		}
+		pos += seqLen;
+		// no matter how many bytes we marched forward by, it was only 1 utf8 codepoint
+		length++;
+	}
+	return length;
+}
+
+// cuts off str (by writing \0 char) after n Unicode codepoints
+// returns number of bytes that remain in string => returns strlen(str) (after cutting off)
+// if str contains <= n codepoints, it's not modified and the number of bytes in it
+// is still returned (excluding terminating \0); n == 0 results in an empty string
+// based on utf8nlen from https://github.com/sheredom/utf8.h/blob/master/utf8.h
+size_t D3_UTF8CutOffAfterNCodepoints( char *str, size_t n )
+{
+	unsigned char *s = (unsigned char *)str;
+	size_t pos = 0;
+	size_t length = 0;
+	if ( s == NULL )
+		return 0;
+
+	while ( s[pos] != '\0' ) {
+		// if we have reached the desired amount of codepoints, cut the rest off
+		// (checked before advancing, so n == 0 works and nothing is written
+		//  if the string contains exactly n codepoints)
+		if ( length == n ) {
+			s[pos] = '\0';
+			break;
+		}
+		// skip one codepoint, but never step over the terminating \0
+		// in case the last sequence is truncated
+		size_t seqLen = D3_UTF8SequenceLength( s[pos] );
+		for ( size_t k = 0; k < seqLen && s[pos] != '\0'; ++k )
+			++pos;
+		length++;
+	}
+	return pos;
+}
diff --git a/neo/idlib/Str.h b/neo/idlib/Str.h
index 4b799a47..79b0aa83 100644
--- a/neo/idlib/Str.h
+++ b/neo/idlib/Str.h
@@ -1082,4 +1082,25 @@ int D3_snprintfC99(char *dst, size_t size, const char *format, ...) id_attribute
 // unlike idStr::vsnPrintf() which returns -1 in that case
 int D3_vsnprintfC99(char *dst, size_t size, const char *format, va_list ap);
 
+// convert UTF-8 to ISO8859-1 (the "High ASCII" 8-bit encoding Doom3 uses)
+// invalidChar is inserted into the output buffer for unicode characters that can't be
+// represented by ISO8859-1; if it's 0, those will just be skipped
+char * D3_UTF8toISO8859_1( const char *utf8str, char *isobuf, int n, char invalidChar=0 );
+
+// convert ISO8859-1 (the "High ASCII" 8-bit encoding Doom3 uses) to UTF-8
+// returns NULL on error (need more than utf8bufLen chars in utf8buf)
+char * D3_ISO8859_1toUTF8( const char* isoStr, char *utf8buf, int utf8bufLen );
+
+// returns number of Unicode codepoints (UTF32 char) in given UTF-8 string.
+// if n is not (size_t)-1, it only looks at the first n bytes of str (but still stops at the first \0)
+// that's not necessarily the number of printed characters (as unicode allows graphemes that
+// consist of multiple codepoints), but for our purposes (limiting to Latin1 subset) it is..
+size_t D3_UTF8CountCodepoints( const char *str, size_t n = -1 );
+
+// cuts off str (by writing \0 char) after n Unicode codepoints
+// returns number of bytes that remain in string => returns strlen(str) (after cutting off)
+// if str contains <= n codepoints, it's not modified and the number of bytes in it
+// is still returned (excluding terminating \0)
+size_t D3_UTF8CutOffAfterNCodepoints( char *str, size_t n );
+
 #endif /* !__STR_H__ */
diff --git a/neo/sys/events.cpp b/neo/sys/events.cpp
index 253ad4dd..d26bb71c 100644
--- a/neo/sys/events.cpp
+++ b/neo/sys/events.cpp
@@ -143,11 +143,6 @@ static float joyAxis[MAX_JOYSTICK_AXIS];
 
 static idList event_overflow;
 
-#if SDL_VERSION_ATLEAST(2, 0, 0)
-// for utf8ToISO8859_1() - used for non-ascii text input and Sys_GetLocalizedScancodeName()
-static SDL_iconv_t iconvDesc = (SDL_iconv_t)-1;
-#endif
-
 struct scancodename_t {
 	int sdlScancode;
 	const char* name;
@@ -243,39 +238,6 @@ static bool isAscii( const char* str_ ) {
 	}
 	return true;
 }
-
-// convert inbuf (which is expected to be in UTF-8) to outbuf (in ISO-8859-1)
-static bool utf8ToISO8859_1(const char* inbuf, char* outbuf, size_t outsize) {
-	if ( iconvDesc == (SDL_iconv_t)-1 ) {
-		return false;
-	}
-
-	size_t outbytesleft = outsize;
-	size_t inbytesleft = strlen( inbuf ) + 1; // + terminating \0
-	size_t ret = SDL_iconv( iconvDesc, &inbuf, &inbytesleft, &outbuf, &outbytesleft );
-
-	while(inbytesleft > 0) {
-		switch ( ret ) {
-			case SDL_ICONV_E2BIG:
-				outbuf[outbytesleft-1] = '\0'; // whatever, just cut it off..
-				common->DPrintf( "Cutting off UTF-8 to ISO-8859-1 conversion to '%s' because destination is too small for '%s'\n", outbuf, inbuf );
-				SDL_iconv( iconvDesc, NULL, NULL, NULL, NULL ); // reset descriptor for next conversion
-				return true;
-			case SDL_ICONV_EILSEQ:
-				// try skipping invalid input data
-				++inbuf;
-				--inbytesleft;
-				break;
-			case SDL_ICONV_EINVAL:
-			case SDL_ICONV_ERROR:
-				// we can't recover from this
-				SDL_iconv( iconvDesc, NULL, NULL, NULL, NULL ); // reset descriptor for next conversion
-				return false;
-		}
-	}
-	SDL_iconv( iconvDesc, NULL, NULL, NULL, NULL ); // reset descriptor for next conversion
-	return outbytesleft < outsize; // return false if no char was written
-}
 #endif // SDL2
 
 // start button isn't bindable, but I want to use its name in the imgui-based menu
@@ -471,7 +433,8 @@ static const char* getLocalizedScancodeName( int key, bool useUtf8 )
 				}
 				static char isoName[32];
 				// try to convert name to ISO8859-1 (Doom3's supported "High ASCII")
-				if ( utf8ToISO8859_1( ret, isoName, sizeof(isoName) ) && isoName[0] != '\0' ) {
+				// TODO: pass '?' as invalidChar?
+				if ( D3_UTF8toISO8859_1( ret, isoName, sizeof(isoName) ) && isoName[0] != '\0' ) {
 					return isoName;
 				}
 			}
@@ -876,13 +839,6 @@ void Sys_InitInput() {
 #if !SDL_VERSION_ATLEAST(2, 0, 0)
 	SDL_EnableUNICODE(1);
 	SDL_EnableKeyRepeat(SDL_DEFAULT_REPEAT_DELAY, SDL_DEFAULT_REPEAT_INTERVAL);
-
-#else // SDL2 - for utf8ToISO8859_1() (non-ascii text input and key naming)
-	assert(iconvDesc == (SDL_iconv_t)-1);
-	iconvDesc = SDL_iconv_open( "ISO-8859-1", "UTF-8" );
-	if( iconvDesc == (SDL_iconv_t)-1 ) {
-		common->Warning( "Sys_SetInput(): iconv_open( \"ISO-8859-1\", \"UTF-8\" ) failed! Can't translate non-ascii input!\n" );
-	}
 #endif
 
 	in_kbd.SetModified();
@@ -932,10 +888,6 @@ void Sys_ShutdownInput() {
 	mouse_polls.Clear();
 	joystick_polls.Clear();
 	event_overflow.Clear();
-#if SDL_VERSION_ATLEAST(2, 0, 0)
-	SDL_iconv_close( iconvDesc ); // used by utf8ToISO8859_1()
-	iconvDesc = ( SDL_iconv_t ) -1;
-#endif
 }
 
 /*
@@ -1326,7 +1278,7 @@ sysEvent_t Sys_GetEvent() {
 						s_pos = 1; // pos 0 is returned
 					}
 					return res;
-				} else if( utf8ToISO8859_1( ev.text.text, s, sizeof(s) ) && s[0] != '\0' ) {
+				} else if( D3_UTF8toISO8859_1( ev.text.text, s, sizeof(s) ) && s[0] != '\0' ) {
 					res.evValue = (unsigned char)s[0];
 					if ( s[1] == '\0' ) {
 						s_pos = 0;