mirror of
https://github.com/dhewm/dhewm3.git
synced 2024-11-26 14:21:18 +00:00
Add some functions to handle UTF-8 strings
- convert to/from ISO8859-1 (Doom3's "High ASCII" encoding) - count Unicode codepoints in UTF-8 string - cut UTF-8 string off after N codepoints - use the conversion function to replace iconv in sys/events.cpp
This commit is contained in:
parent
30e4a9bb51
commit
6dc36c6175
3 changed files with 192 additions and 51 deletions
|
@ -1880,3 +1880,171 @@ int D3_snprintfC99(char *dst, size_t size, const char *format, ...)
|
||||||
va_end( argptr );
|
va_end( argptr );
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Converts the UTF-8 string utf8str to ISO8859-1 (the "High ASCII" 8-bit encoding Doom3 uses)
// and writes the \0-terminated result to isobuf, which has room for isobufLen bytes
// (including the terminating \0).
// invalidChar is inserted into the output buffer for every Unicode codepoint that can't be
// represented by ISO8859-1 (everything above U+00FF); if it's 0, those are just skipped.
// Returns isobuf on success and NULL on error (NULL argument, need more than isobufLen
// bytes in isobuf, or invalid encoding in utf8str). On error the contents of isobuf are
// unspecified and not necessarily \0-terminated.
// based on stb_from_utf8 from https://github.com/nothings/stb/blob/master/deprecated/stb.h#L1010
char * D3_UTF8toISO8859_1( const char *utf8str, char *isobuf, int isobufLen, char invalidChar )
{
	if ( utf8str == NULL || isobuf == NULL || isobufLen < 1 ) {
		return NULL; // not even room for the terminating \0
	}

	const unsigned char *str = (const unsigned char *)utf8str;
	unsigned char *buffer = (unsigned char *)isobuf;
	const int n = isobufLen - 1; // max. number of chars to write, keeps one byte for the \0
	int i = 0;

	while ( *str != '\0' ) {
		if ( i >= n ) {
			return NULL; // output buffer is full
		}

		const unsigned char lead = *str;

		if ( (lead & 0x80) == 0 ) {
			// ASCII char => copy directly
			buffer[i++] = lead;
			++str;
		} else if ( (lead & 0xe0) == 0xc0 ) {
			// 2-byte sequence: Unicode character between U+0080 and U+07FF
			// => representable by ISO8859-1 if it's <= U+00FF
			if ( lead < 0xc2 ) {
				return NULL; // overlong encoding of an ASCII char
			}
			if ( (str[1] & 0xc0) != 0x80 ) {
				return NULL; // missing continuation byte (this also catches the end of the string)
			}
			const unsigned c = ((lead & 0x1fu) << 6) | (str[1] & 0x3fu);
			str += 2;
			if ( c <= 0xFF ) { // U+00FF (y with diaeresis) is still part of ISO8859-1
				buffer[i++] = (unsigned char)c;
			} else if ( invalidChar != 0 ) {
				buffer[i++] = invalidChar;
			}
		} else if ( (lead & 0xf0) == 0xe0 ) {
			// 3-byte sequence: Unicode character between U+0800 and U+FFFF
			// => way out of range for ISO8859-1, so just validate and skip the input bytes
			if ( lead == 0xe0 && (str[1] < 0xa0 || str[1] > 0xbf) ) {
				return NULL; // overlong encoding
			}
			if ( lead == 0xed && str[1] > 0x9f ) {
				return NULL; // U+D800..U+DFFF (UTF-16 surrogates) are invalid in UTF-8
			}
			// the checks are done in order, so nothing behind a terminating \0 is ever read
			if ( (str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80 ) {
				return NULL;
			}
			str += 3;
			if ( invalidChar != 0 ) {
				buffer[i++] = invalidChar;
			}
		} else if ( (lead & 0xf8) == 0xf0 ) {
			// 4-byte sequence: Unicode character between U+010000 and U+10FFFF
			// => even more out of range; again, validate and skip
			// (all these codepoints are >= U+10000, so there's no need to check for surrogates)
			if ( lead > 0xf4 ) {
				return NULL; // would be above U+10FFFF
			}
			if ( lead == 0xf0 && (str[1] < 0x90 || str[1] > 0xbf) ) {
				return NULL; // overlong encoding
			}
			if ( lead == 0xf4 && str[1] > 0x8f ) {
				return NULL; // above U+10FFFF
			}
			if ( (str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80 || (str[3] & 0xc0) != 0x80 ) {
				return NULL;
			}
			str += 4;
			if ( invalidChar != 0 ) {
				buffer[i++] = invalidChar;
			}
		} else {
			// stray continuation byte or a byte that's never valid in UTF-8
			return NULL;
		}
	}

	buffer[i] = '\0';
	return isobuf;
}
|
||||||
|
|
||||||
|
// Converts the ISO8859-1 string isoStr (ISO8859-1 is the "High ASCII" 8-bit encoding
// Doom3 uses) to UTF-8 and writes the \0-terminated result to utf8buf, which has room for
// utf8bufLen bytes (including the terminating \0).
// Every char >= 0x80 needs two bytes in UTF-8, so in the worst case utf8buf must hold
// 2*strlen(isoStr) + 1 bytes.
// Returns utf8buf on success and NULL on error (NULL argument or need more than utf8bufLen
// bytes in utf8buf). On error the contents of utf8buf are unspecified and not necessarily
// \0-terminated, but nothing is ever written outside of utf8buf.
// based on stb_to_utf8 from https://github.com/nothings/stb/blob/master/deprecated/stb.h#L1060
char * D3_ISO8859_1toUTF8( const char* isoStr, char *utf8buf, int utf8bufLen )
{
	if ( isoStr == NULL || utf8buf == NULL || utf8bufLen < 1 ) {
		return NULL; // not even room for the terminating \0
	}

	const unsigned char *src = (const unsigned char *)isoStr;
	unsigned char *dst = (unsigned char *)utf8buf;
	const int capacity = utf8bufLen - 1; // keeps one byte for the terminating \0
	int used = 0;

	for ( ; *src != '\0'; ++src ) {
		const unsigned char ch = *src;
		const int needed = ( ch < 0x80 ) ? 1 : 2;

		// make sure the *whole* encoded char fits - checking for just one free byte
		// would let the second byte of a two-byte sequence push the \0 out of the buffer
		if ( needed > capacity - used ) {
			return NULL;
		}

		if ( needed == 1 ) {
			dst[used++] = ch; // ASCII is identical in both encodings
		} else {
			// U+0080..U+00FF => 110000xx 10xxxxxx
			dst[used++] = (unsigned char)( 0xc0 + (ch >> 6) );
			dst[used++] = (unsigned char)( 0x80 + (ch & 0x3f) );
		}
	}

	dst[used] = '\0';
	return utf8buf;
}
|
||||||
|
|
||||||
|
// Returns the number of Unicode codepoints (UTF32 chars) in the given UTF-8 string.
// If n is not (size_t)-1, only the first n bytes of str are looked at (but counting still
// stops at the first \0). A multi-byte sequence that is cut off by that limit or by the
// end of the string is not counted.
// The input is not validated: the length of each sequence is derived from its first byte
// only, and stray continuation bytes or otherwise invalid bytes count as one codepoint each.
// Returns 0 if str is NULL.
// The result is not necessarily the number of printed characters (as unicode allows graphemes
// that consist of multiple codepoints), but for our purposes (limiting to Latin1 subset) it is..
// based on utf8nlen from https://github.com/sheredom/utf8.h/blob/master/utf8.h
size_t D3_UTF8CountCodepoints( const char *str, size_t n )
{
	if ( str == NULL ) {
		return 0;
	}

	const unsigned char *cur = (const unsigned char *)str;
	size_t bytesLeft = n; // (size_t)-1 ("no limit") is bigger than any string can be
	size_t count = 0;

	while ( bytesLeft > 0 && *cur != '\0' ) {
		size_t seqLen;
		if ( (*cur & 0xf8) == 0xf0 ) {
			seqLen = 4; // began with 0b11110xxx
		} else if ( (*cur & 0xf0) == 0xe0 ) {
			seqLen = 3; // began with 0b1110xxxx
		} else if ( (*cur & 0xe0) == 0xc0 ) {
			seqLen = 2; // began with 0b110xxxxx
		} else {
			seqLen = 1; // ASCII (or an invalid byte, which we count as one codepoint)
		}

		if ( seqLen > bytesLeft ) {
			break; // the n bytes limit is in the middle of this codepoint
		}

		// never step over the terminating \0, even if the first byte
		// promised more bytes than the string actually has
		size_t avail = 1;
		while ( avail < seqLen && cur[avail] != '\0' ) {
			++avail;
		}
		if ( avail < seqLen ) {
			break; // string ends in the middle of this codepoint
		}

		// no matter how many bytes we march forward, it was only one codepoint
		cur += seqLen;
		bytesLeft -= seqLen;
		++count;
	}

	return count;
}
|
||||||
|
|
||||||
|
// Cuts off the UTF-8 string str (by writing a \0 char) after n Unicode codepoints.
// Returns the number of bytes that remain in the string, i.e. strlen(str) after cutting off.
// If str contains <= n codepoints, it's not modified and the number of bytes in it
// is still returned (excluding the terminating \0).
// n == 0 turns str into an empty string. Returns 0 if str is NULL.
// The input is not validated: the length of each sequence is derived from its first byte
// only, but a sequence that is cut short by the end of the string never makes this function
// read or write behind the terminating \0.
// based on utf8nlen from https://github.com/sheredom/utf8.h/blob/master/utf8.h
size_t D3_UTF8CutOffAfterNCodepoints( char *str, size_t n )
{
	if ( str == NULL ) {
		return 0;
	}

	unsigned char *cur = (unsigned char *)str;
	size_t count = 0;

	// walk over (up to) n codepoints
	while ( count < n && *cur != '\0' ) {
		size_t seqLen;
		if ( (*cur & 0xf8) == 0xf0 ) {
			seqLen = 4; // began with 0b11110xxx
		} else if ( (*cur & 0xf0) == 0xe0 ) {
			seqLen = 3; // began with 0b1110xxxx
		} else if ( (*cur & 0xe0) == 0xc0 ) {
			seqLen = 2; // began with 0b110xxxxx
		} else {
			seqLen = 1; // ASCII (or an invalid byte, which we count as one codepoint)
		}

		// advance byte by byte so a truncated sequence can't carry us over the \0
		size_t step = 1;
		while ( step < seqLen && cur[step] != '\0' ) {
			++step;
		}

		// no matter how many bytes we march forward, it was only one codepoint
		cur += step;
		++count;
	}

	// if anything is left behind the first n codepoints, cut it off
	// (otherwise cur already points at the terminating \0 and str stays untouched)
	if ( *cur != '\0' ) {
		*cur = '\0';
	}

	return (size_t)( (char *)cur - str );
}
|
||||||
|
|
|
@ -1082,4 +1082,25 @@ int D3_snprintfC99(char *dst, size_t size, const char *format, ...) id_attribute
|
||||||
// unlike idStr::vsnPrintf() which returns -1 in that case
|
// unlike idStr::vsnPrintf() which returns -1 in that case
|
||||||
int D3_vsnprintfC99(char *dst, size_t size, const char *format, va_list ap);
|
int D3_vsnprintfC99(char *dst, size_t size, const char *format, va_list ap);
|
||||||
|
|
||||||
|
// convert UTF-8 to ISO8859-1 (the "High ASCII" 8-bit encoding Doom3 uses)
|
||||||
|
// invalidChar is inserted into the output buffer for unicode characters that can't be
|
||||||
|
// represented by ISO8859-1; if it's 0, those will just be skipped
|
||||||
|
char * D3_UTF8toISO8859_1( const char *utf8str, char *isobuf, int n, char invalidChar=0 );
|
||||||
|
|
||||||
|
// convert ISO8859-1 (the "High ASCII" 8-bit encoding Doom3 uses) to UTF-8
|
||||||
|
// returns NULL on error (need more than utf8bufLen chars in utf8buf)
|
||||||
|
char * D3_ISO8859_1toUTF8( const char* isoStr, char *utf8buf, int utf8bufLen );
|
||||||
|
|
||||||
|
// returns number of Unicode codepoints (UTF32 char) in given UTF-8 string.
|
||||||
|
// if n is not (size_t)-1 (the default), it only looks at the first n bytes of str (but still stops at the first \0)
|
||||||
|
// that's not necessarily the number of printed characters (as unicode allows graphemes that
|
||||||
|
// consist of multiple codepoints), but for our purposes (limiting to Latin1 subset) it is..
|
||||||
|
size_t D3_UTF8CountCodepoints( const char *str, size_t n = -1 );
|
||||||
|
|
||||||
|
// cuts off str (by writing \0 char) after n Unicode codepoints
|
||||||
|
// returns number of bytes that remain in string => returns strlen(str) (after cutting off)
|
||||||
|
// if str contains <= n codepoints, it's not modified and the number of bytes in it
|
||||||
|
// is still returned (excluding terminating \0)
|
||||||
|
size_t D3_UTF8CutOffAfterNCodepoints( char *str, size_t n );
|
||||||
|
|
||||||
#endif /* !__STR_H__ */
|
#endif /* !__STR_H__ */
|
||||||
|
|
|
@ -143,11 +143,6 @@ static float joyAxis[MAX_JOYSTICK_AXIS];
|
||||||
|
|
||||||
static idList<sysEvent_t> event_overflow;
|
static idList<sysEvent_t> event_overflow;
|
||||||
|
|
||||||
#if SDL_VERSION_ATLEAST(2, 0, 0)
|
|
||||||
// for utf8ToISO8859_1() - used for non-ascii text input and Sys_GetLocalizedScancodeName()
|
|
||||||
static SDL_iconv_t iconvDesc = (SDL_iconv_t)-1;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct scancodename_t {
|
struct scancodename_t {
|
||||||
int sdlScancode;
|
int sdlScancode;
|
||||||
const char* name;
|
const char* name;
|
||||||
|
@ -243,39 +238,6 @@ static bool isAscii( const char* str_ ) {
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// convert inbuf (which is expected to be in UTF-8) to outbuf (in ISO-8859-1)
|
|
||||||
static bool utf8ToISO8859_1(const char* inbuf, char* outbuf, size_t outsize) {
|
|
||||||
if ( iconvDesc == (SDL_iconv_t)-1 ) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t outbytesleft = outsize;
|
|
||||||
size_t inbytesleft = strlen( inbuf ) + 1; // + terminating \0
|
|
||||||
size_t ret = SDL_iconv( iconvDesc, &inbuf, &inbytesleft, &outbuf, &outbytesleft );
|
|
||||||
|
|
||||||
while(inbytesleft > 0) {
|
|
||||||
switch ( ret ) {
|
|
||||||
case SDL_ICONV_E2BIG:
|
|
||||||
outbuf[outbytesleft-1] = '\0'; // whatever, just cut it off..
|
|
||||||
common->DPrintf( "Cutting off UTF-8 to ISO-8859-1 conversion to '%s' because destination is too small for '%s'\n", outbuf, inbuf );
|
|
||||||
SDL_iconv( iconvDesc, NULL, NULL, NULL, NULL ); // reset descriptor for next conversion
|
|
||||||
return true;
|
|
||||||
case SDL_ICONV_EILSEQ:
|
|
||||||
// try skipping invalid input data
|
|
||||||
++inbuf;
|
|
||||||
--inbytesleft;
|
|
||||||
break;
|
|
||||||
case SDL_ICONV_EINVAL:
|
|
||||||
case SDL_ICONV_ERROR:
|
|
||||||
// we can't recover from this
|
|
||||||
SDL_iconv( iconvDesc, NULL, NULL, NULL, NULL ); // reset descriptor for next conversion
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
SDL_iconv( iconvDesc, NULL, NULL, NULL, NULL ); // reset descriptor for next conversion
|
|
||||||
return outbytesleft < outsize; // return false if no char was written
|
|
||||||
}
|
|
||||||
#endif // SDL2
|
#endif // SDL2
|
||||||
|
|
||||||
// start button isn't bindable, but I want to use its name in the imgui-based menu
|
// start button isn't bindable, but I want to use its name in the imgui-based menu
|
||||||
|
@ -471,7 +433,8 @@ static const char* getLocalizedScancodeName( int key, bool useUtf8 )
|
||||||
}
|
}
|
||||||
static char isoName[32];
|
static char isoName[32];
|
||||||
// try to convert name to ISO8859-1 (Doom3's supported "High ASCII")
|
// try to convert name to ISO8859-1 (Doom3's supported "High ASCII")
|
||||||
if ( utf8ToISO8859_1( ret, isoName, sizeof(isoName) ) && isoName[0] != '\0' ) {
|
// TODO: pass '?' as invalidChar?
|
||||||
|
if ( D3_UTF8toISO8859_1( ret, isoName, sizeof(isoName) ) && isoName[0] != '\0' ) {
|
||||||
return isoName;
|
return isoName;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -876,13 +839,6 @@ void Sys_InitInput() {
|
||||||
#if !SDL_VERSION_ATLEAST(2, 0, 0)
|
#if !SDL_VERSION_ATLEAST(2, 0, 0)
|
||||||
SDL_EnableUNICODE(1);
|
SDL_EnableUNICODE(1);
|
||||||
SDL_EnableKeyRepeat(SDL_DEFAULT_REPEAT_DELAY, SDL_DEFAULT_REPEAT_INTERVAL);
|
SDL_EnableKeyRepeat(SDL_DEFAULT_REPEAT_DELAY, SDL_DEFAULT_REPEAT_INTERVAL);
|
||||||
|
|
||||||
#else // SDL2 - for utf8ToISO8859_1() (non-ascii text input and key naming)
|
|
||||||
assert(iconvDesc == (SDL_iconv_t)-1);
|
|
||||||
iconvDesc = SDL_iconv_open( "ISO-8859-1", "UTF-8" );
|
|
||||||
if( iconvDesc == (SDL_iconv_t)-1 ) {
|
|
||||||
common->Warning( "Sys_SetInput(): iconv_open( \"ISO-8859-1\", \"UTF-8\" ) failed! Can't translate non-ascii input!\n" );
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
in_kbd.SetModified();
|
in_kbd.SetModified();
|
||||||
|
@ -932,10 +888,6 @@ void Sys_ShutdownInput() {
|
||||||
mouse_polls.Clear();
|
mouse_polls.Clear();
|
||||||
joystick_polls.Clear();
|
joystick_polls.Clear();
|
||||||
event_overflow.Clear();
|
event_overflow.Clear();
|
||||||
#if SDL_VERSION_ATLEAST(2, 0, 0)
|
|
||||||
SDL_iconv_close( iconvDesc ); // used by utf8ToISO8859_1()
|
|
||||||
iconvDesc = ( SDL_iconv_t ) -1;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1326,7 +1278,7 @@ sysEvent_t Sys_GetEvent() {
|
||||||
s_pos = 1; // pos 0 is returned
|
s_pos = 1; // pos 0 is returned
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
} else if( utf8ToISO8859_1( ev.text.text, s, sizeof(s) ) && s[0] != '\0' ) {
|
} else if( D3_UTF8toISO8859_1( ev.text.text, s, sizeof(s) ) && s[0] != '\0' ) {
|
||||||
res.evValue = (unsigned char)s[0];
|
res.evValue = (unsigned char)s[0];
|
||||||
if ( s[1] == '\0' ) {
|
if ( s[1] == '\0' ) {
|
||||||
s_pos = 0;
|
s_pos = 0;
|
||||||
|
|
Loading…
Reference in a new issue