Add some functions to handle UTF-8 strings

- convert to/from ISO8859-1 (Doom3's "High ASCII" encoding)
- count Unicode codepoints in UTF-8 string
- cut UTF-8 string off after N codepoints
- use the conversion function to replace iconv in sys/events.cpp
This commit is contained in:
Daniel Gibson 2024-06-02 13:14:30 +02:00
parent 30e4a9bb51
commit 6dc36c6175
3 changed files with 192 additions and 51 deletions

View file

@ -1880,3 +1880,171 @@ int D3_snprintfC99(char *dst, size_t size, const char *format, ...)
va_end( argptr );
return ret;
}
// Convert UTF-8 to ISO8859-1 (the "High ASCII" 8-bit encoding Doom3 uses).
//
//   utf8str     NUL-terminated UTF-8 input
//   isobuf      output buffer, receives the NUL-terminated ISO8859-1 string
//   isobufLen   size of isobuf in bytes, including room for the terminating \0
//   invalidChar inserted into the output for every Unicode codepoint that can't be
//               represented by ISO8859-1 (> U+00FF); if it's 0, those are just skipped
//
// Returns isobuf on success and NULL on error: invalid encoding in utf8str, more than
// isobufLen-1 chars needed in isobuf, or a NULL/zero-sized buffer argument.
// On error the contents of isobuf are unspecified (possibly partially written, unterminated).
// based on stb_from_utf8 from https://github.com/nothings/stb/blob/master/deprecated/stb.h#L1010
char * D3_UTF8toISO8859_1( const char *utf8str, char *isobuf, int isobufLen, char invalidChar )
{
	const unsigned char *str = (const unsigned char *) utf8str;
	unsigned char *buffer = (unsigned char *) isobuf;
	int i = 0;
	int n = isobufLen - 1; // max. number of chars to write, not counting the terminating \0

	if ( utf8str == NULL || isobuf == NULL || isobufLen < 1 )
		return NULL;

	while ( *str ) {
		// byte to append for the current codepoint; -1 means "write nothing"
		int out = -1;

		if ( !(*str & 0x80) ) {
			// ASCII char => copy directly
			out = *str++;
		} else if ( (*str & 0xe0) == 0xc0 ) {
			// Unicode character between 0x0080 and 0x07FF
			// => might be representable by ISO8859-1
			unsigned c;
			if ( *str < 0xc2 ) return NULL; // overlong encoding of an ASCII char
			c = (unsigned)(*str++ & 0x1f) << 6;
			if ( (*str & 0xc0) != 0x80 ) return NULL;
			c += (*str++ & 0x3f);
			if ( c <= 0xFF ) // 0xFF (y with diaeresis) is still part of ISO8859-1
				out = (int)c;
			else if ( invalidChar != 0 )
				out = (unsigned char)invalidChar;
		} else if ( (*str & 0xf0) == 0xe0 ) {
			// Unicode character between 0x0800 and 0xFFFF => way out of range for ISO8859-1
			// so just validate and skip the input bytes
			if ( *str == 0xe0 && (str[1] < 0xa0 || str[1] > 0xbf) ) return NULL; // overlong
			if ( *str == 0xed && str[1] > 0x9f ) return NULL; // UTF-16 surrogate range
			// checked one by one so nothing behind a premature \0 is ever read
			if ( (str[1] & 0xc0) != 0x80 ) return NULL;
			if ( (str[2] & 0xc0) != 0x80 ) return NULL;
			str += 3;
			if ( invalidChar != 0 )
				out = (unsigned char)invalidChar;
		} else if ( (*str & 0xf8) == 0xf0 ) {
			// Unicode character between 0x010000 and 0x10FFFF => even more out of range
			// again, validate and skip
			if ( *str > 0xf4 ) return NULL; // would be > 0x10FFFF
			if ( *str == 0xf0 && (str[1] < 0x90 || str[1] > 0xbf) ) return NULL; // overlong
			if ( *str == 0xf4 && str[1] > 0x8f ) return NULL; // would be > 0x10FFFF
			if ( (str[1] & 0xc0) != 0x80 ) return NULL;
			if ( (str[2] & 0xc0) != 0x80 ) return NULL;
			if ( (str[3] & 0xc0) != 0x80 ) return NULL;
			str += 4;
			if ( invalidChar != 0 )
				out = (unsigned char)invalidChar;
		} else {
			// stray continuation byte or a byte that never occurs in UTF-8
			return NULL;
		}

		if ( out >= 0 ) {
			// only codepoints that actually produce output need space in isobuf
			if ( i >= n )
				return NULL;
			buffer[i++] = (unsigned char)out;
		}
	}
	buffer[i] = '\0';
	return isobuf;
}
// Convert ISO8859-1 (the "High ASCII" 8-bit encoding Doom3 uses) to UTF-8.
//
//   isoStr      NUL-terminated ISO8859-1 input
//   utf8buf     output buffer, receives the NUL-terminated UTF-8 string
//   utf8bufLen  size of utf8buf in bytes, including room for the terminating \0
//
// Chars < 0x80 are copied as they are, all others become a two-byte UTF-8 sequence.
// Returns utf8buf on success and NULL on error (need more than utf8bufLen chars in utf8buf,
// or a NULL/zero-sized buffer argument); on error utf8buf may be partially written
// and unterminated, but is never written beyond utf8bufLen bytes.
// based on stb_to_utf8 from https://github.com/nothings/stb/blob/master/deprecated/stb.h#L1060
char * D3_ISO8859_1toUTF8( const char* isoStr, char *utf8buf, int utf8bufLen )
{
	const unsigned char *str = (const unsigned char *)isoStr;
	unsigned char *buffer = (unsigned char *)utf8buf;
	int i = 0;
	int n = utf8bufLen - 1; // max. number of bytes to write, not counting the terminating \0

	if ( isoStr == NULL || utf8buf == NULL || utf8bufLen < 1 )
		return NULL;

	while ( *str ) {
		if ( *str < 0x80 ) {
			if ( n - i < 1 )
				return NULL;
			buffer[i++] = *str++;
		} else {
			// needs two output bytes - make sure *both* fit, so the
			// terminating \0 still lands inside the buffer
			if ( n - i < 2 )
				return NULL;
			buffer[i++] = 0xc0 + (*str >> 6);
			buffer[i++] = 0x80 + (*str & 0x3f);
			++str;
		}
	}
	buffer[i] = '\0';
	return utf8buf;
}
// Returns the number of Unicode codepoints (UTF32 chars) in the given UTF-8 string.
//
//   str  NUL-terminated UTF-8 string (NULL is treated like an empty string)
//   n    maximum number of bytes of str to look at; (size_t)-1 means "the whole string".
//        Counting always stops at the first \0, even if n is bigger than that.
//
// Only complete codepoints are counted: a multi-byte sequence that is cut off by the
// n-byte limit or by the terminating \0 is not included in the result.
// Apart from that the string is not validated, the length of each sequence is derived
// from its first byte only.
// The result is not necessarily the number of printed characters (as unicode allows
// graphemes that consist of multiple codepoints), but for our purposes (limiting to
// Latin1 subset) it is..
// based on utf8nlen from https://github.com/sheredom/utf8.h/blob/master/utf8.h
size_t D3_UTF8CountCodepoints( const char *str, size_t n )
{
	const unsigned char *s = (const unsigned char *) str;
	size_t pos = 0;   // byte offset of the sequence that's looked at next
	size_t count = 0; // complete codepoints found so far

	if ( str == NULL )
		return 0;

	// for n == (size_t)-1 the byte limit can never be reached before the \0,
	// so no separate strlen() pass is needed
	while ( pos < n && s[pos] != '\0' ) {
		const unsigned char lead = s[pos];
		size_t seqLen = 1; // 1-byte ascii (or a stray byte that's stepped over)
		size_t k;

		if ( (lead & 0xf8) == 0xf0 ) {
			seqLen = 4; // 4-byte utf8 code point (began with 0b11110xxx)
		} else if ( (lead & 0xf0) == 0xe0 ) {
			seqLen = 3; // 3-byte utf8 code point (began with 0b1110xxxx)
		} else if ( (lead & 0xe0) == 0xc0 ) {
			seqLen = 2; // 2-byte utf8 code point (began with 0b110xxxxx)
		}

		// sequence sticks out of the first n bytes => incomplete, don't count it
		if ( seqLen > n - pos )
			break;

		// never step over the terminating \0 of a truncated sequence
		for ( k = 1; k < seqLen; ++k ) {
			if ( s[pos + k] == '\0' )
				break;
		}
		if ( k < seqLen )
			break;

		// no matter how many bytes we march forward, it was only 1 utf8 codepoint
		pos += seqLen;
		++count;
	}
	return count;
}
// Cuts off str (by writing a \0 char) after n Unicode codepoints.
//
//   str  NUL-terminated UTF-8 string, modified in place (NULL is ignored, returns 0)
//   n    number of codepoints to keep; 0 turns str into an empty string
//
// Returns the number of bytes that remain in the string => returns strlen(str) (after
// cutting off). If str contains <= n codepoints, it's not modified and the number of
// bytes in it is still returned (excluding terminating \0).
// The string is not validated, the length of each sequence is derived from its first
// byte only; a sequence that is truncated by the terminating \0 counts as one codepoint
// and the terminator is never stepped over.
// based on utf8nlen from https://github.com/sheredom/utf8.h/blob/master/utf8.h
size_t D3_UTF8CutOffAfterNCodepoints( char *str, size_t n )
{
	unsigned char *s = (unsigned char *) str;
	size_t pos = 0;   // byte offset of the sequence that's looked at next
	size_t count = 0; // codepoints that are kept so far

	if ( str == NULL )
		return 0;

	while ( s[pos] != '\0' ) {
		size_t seqLen = 1; // 1-byte ascii (or a stray byte that's stepped over)
		size_t k;

		// if we have reached the desired amount of codepoints, cut the rest off
		// (checked before advancing, so n == 0 works as well)
		if ( count == n ) {
			s[pos] = '\0';
			break;
		}

		if ( (s[pos] & 0xf8) == 0xf0 ) {
			seqLen = 4; // 4-byte utf8 code point (began with 0b11110xxx)
		} else if ( (s[pos] & 0xf0) == 0xe0 ) {
			seqLen = 3; // 3-byte utf8 code point (began with 0b1110xxxx)
		} else if ( (s[pos] & 0xe0) == 0xc0 ) {
			seqLen = 2; // 2-byte utf8 code point (began with 0b110xxxxx)
		}

		// march forward over the sequence, but stop at a premature \0
		++pos;
		for ( k = 1; k < seqLen && s[pos] != '\0'; ++k ) {
			++pos;
		}

		// no matter how many bytes we marched forward, it was only 1 utf8 codepoint
		++count;
	}
	return pos;
}

View file

@ -1082,4 +1082,25 @@ int D3_snprintfC99(char *dst, size_t size, const char *format, ...) id_attribute
// unlike idStr::vsnPrintf() which returns -1 in that case
int D3_vsnprintfC99(char *dst, size_t size, const char *format, va_list ap);
// convert UTF-8 to ISO8859-1 (the "High ASCII" 8-bit encoding Doom3 uses)
// utf8str is the \0-terminated input, isobuf the output buffer and n the size of isobuf
// in bytes (including the terminating \0)
// invalidChar is inserted into the output buffer for unicode characters that can't be
// represented by ISO8859-1; if it's 0, those will just be skipped
// returns isobuf on success and NULL on error (isobuf is too small or invalid encoding
// in utf8str)
char * D3_UTF8toISO8859_1( const char *utf8str, char *isobuf, int n, char invalidChar=0 );
// convert ISO8859-1 (the "High ASCII" 8-bit encoding Doom3 uses) to UTF-8
// isoStr is the \0-terminated input, utf8buf the output buffer and utf8bufLen the size
// of utf8buf in bytes (including the terminating \0)
// returns utf8buf on success and NULL on error (need more than utf8bufLen chars in utf8buf)
char * D3_ISO8859_1toUTF8( const char* isoStr, char *utf8buf, int utf8bufLen );
// returns number of Unicode codepoints (UTF32 char) in given UTF-8 string.
// n is the number of bytes of str to look at (it still stops at the first \0);
// the default value (size_t)-1 means that the whole string is looked at
// that's not necessarily the number of printed characters (as unicode allows graphemes that
// consist of multiple codepoints), but for our purposes (limiting to Latin1 subset) it is..
size_t D3_UTF8CountCodepoints( const char *str, size_t n = -1 );
// cuts off str (by writing \0 char) after n Unicode codepoints
// returns number of bytes that remain in string => returns strlen(str) (after cutting off)
// if str contains <= n codepoints, it's not modified and the number of bytes in it
// is still returned (excluding terminating \0)
size_t D3_UTF8CutOffAfterNCodepoints( char *str, size_t n );
#endif /* !__STR_H__ */

View file

@ -143,11 +143,6 @@ static float joyAxis[MAX_JOYSTICK_AXIS];
static idList<sysEvent_t> event_overflow;
#if SDL_VERSION_ATLEAST(2, 0, 0)
// for utf8ToISO8859_1() - used for non-ascii text input and Sys_GetLocalizedScancodeName()
static SDL_iconv_t iconvDesc = (SDL_iconv_t)-1;
#endif
struct scancodename_t {
int sdlScancode;
const char* name;
@ -243,39 +238,6 @@ static bool isAscii( const char* str_ ) {
}
return true;
}
// convert inbuf (which is expected to be in UTF-8) to outbuf (in ISO-8859-1)
// uses the file-level SDL_iconv descriptor iconvDesc; outsize is the size of outbuf in bytes
// returns false if iconvDesc is not open, if SDL_iconv() reports EINVAL/ERROR or if
// no byte was written to outbuf at all, otherwise true (also if the output was cut off
// because outbuf is too small)
static bool utf8ToISO8859_1(const char* inbuf, char* outbuf, size_t outsize) {
// (SDL_iconv_t)-1 is the "not opened" value iconvDesc is initialized with
if ( iconvDesc == (SDL_iconv_t)-1 ) {
return false;
}
size_t outbytesleft = outsize;
size_t inbytesleft = strlen( inbuf ) + 1; // + terminating \0
size_t ret = SDL_iconv( iconvDesc, &inbuf, &inbytesleft, &outbuf, &outbytesleft );
// NOTE(review): SDL_iconv() is not called again inside this loop, so ret keeps the
// result of the single call above; if ret is none of the handled error codes while
// inbytesleft is still > 0, nothing in the loop body changes and it wouldn't terminate
// - presumably a successful call always leaves inbytesleft at 0, TODO confirm
while(inbytesleft > 0) {
switch ( ret ) {
case SDL_ICONV_E2BIG:
outbuf[outbytesleft-1] = '\0'; // whatever, just cut it off..
common->DPrintf( "Cutting off UTF-8 to ISO-8859-1 conversion to '%s' because destination is too small for '%s'\n", outbuf, inbuf );
SDL_iconv( iconvDesc, NULL, NULL, NULL, NULL ); // reset descriptor for next conversion
return true;
case SDL_ICONV_EILSEQ:
// try skipping invalid input data
// (as ret stays EILSEQ, this ends up dropping the remaining input byte by byte)
++inbuf;
--inbytesleft;
break;
case SDL_ICONV_EINVAL:
case SDL_ICONV_ERROR:
// we can't recover from this
SDL_iconv( iconvDesc, NULL, NULL, NULL, NULL ); // reset descriptor for next conversion
return false;
}
}
SDL_iconv( iconvDesc, NULL, NULL, NULL, NULL ); // reset descriptor for next conversion
return outbytesleft < outsize; // return false if no char was written
}
#endif // SDL2
// start button isn't bindable, but I want to use its name in the imgui-based menu
@ -471,7 +433,8 @@ static const char* getLocalizedScancodeName( int key, bool useUtf8 )
}
static char isoName[32];
// try to convert name to ISO8859-1 (Doom3's supported "High ASCII")
if ( utf8ToISO8859_1( ret, isoName, sizeof(isoName) ) && isoName[0] != '\0' ) {
// TODO: pass '?' as invalidChar?
if ( D3_UTF8toISO8859_1( ret, isoName, sizeof(isoName) ) && isoName[0] != '\0' ) {
return isoName;
}
}
@ -876,13 +839,6 @@ void Sys_InitInput() {
#if !SDL_VERSION_ATLEAST(2, 0, 0)
SDL_EnableUNICODE(1);
SDL_EnableKeyRepeat(SDL_DEFAULT_REPEAT_DELAY, SDL_DEFAULT_REPEAT_INTERVAL);
#else // SDL2 - for utf8ToISO8859_1() (non-ascii text input and key naming)
assert(iconvDesc == (SDL_iconv_t)-1);
iconvDesc = SDL_iconv_open( "ISO-8859-1", "UTF-8" );
if( iconvDesc == (SDL_iconv_t)-1 ) {
common->Warning( "Sys_SetInput(): iconv_open( \"ISO-8859-1\", \"UTF-8\" ) failed! Can't translate non-ascii input!\n" );
}
#endif
in_kbd.SetModified();
@ -932,10 +888,6 @@ void Sys_ShutdownInput() {
mouse_polls.Clear();
joystick_polls.Clear();
event_overflow.Clear();
#if SDL_VERSION_ATLEAST(2, 0, 0)
SDL_iconv_close( iconvDesc ); // used by utf8ToISO8859_1()
iconvDesc = ( SDL_iconv_t ) -1;
#endif
}
/*
@ -1326,7 +1278,7 @@ sysEvent_t Sys_GetEvent() {
s_pos = 1; // pos 0 is returned
}
return res;
} else if( utf8ToISO8859_1( ev.text.text, s, sizeof(s) ) && s[0] != '\0' ) {
} else if( D3_UTF8toISO8859_1( ev.text.text, s, sizeof(s) ) && s[0] != '\0' ) {
res.evValue = (unsigned char)s[0];
if ( s[1] == '\0' ) {
s_pos = 0;