UTF8 parsing improvements

This commit is contained in:
Richard Frith-Macdonald 2018-07-14 07:25:40 +01:00
parent 053862f652
commit e60b2004af
3 changed files with 114 additions and 9 deletions

View file

@ -1,3 +1,9 @@
2018-07-14 Richard Frith-Macdonald <rfm@gnu.org>
* Source/Additions/Unicode.m: improve utf8 validity checks,
switch to state machine based utf8 parsing for better performance.
* Tests/base/NSString/utf8.m: add a few tests for utf8 parsing.
2018-07-10 Richard Frith-Macdonald <rfm@gnu.org>
* configure.ac:

View file

@ -926,6 +926,22 @@ GSToUnicode(unichar **dst, unsigned int *size, const unsigned char *src,
{
int i, sle = 0;
/* legal first byte of a multibyte character?
*/
if (c <= 0xc1 || c >= 0xf5)
{
/* (0x80 <= c < 0xc0) means this is a continuation
* of a multibyte character without the first byte.
*
* (0xc0 == c || 0xc1 == c) are always illegal because
* they could only begin an overlong (non-minimal) two
* byte encoding of a codepoint below 0x80.
*
* (c >= 0xf5) would be for a multibyte character
* outside the unicode range.
*/
result = NO;
goto done;
}
/* calculate the expected sequence length */
while (c & 0x80)
{
@ -933,18 +949,11 @@ GSToUnicode(unichar **dst, unsigned int *size, const unsigned char *src,
sle++;
}
/* legal ? */
if ((sle < 2) || (sle > 6))
{
result = NO;
goto done;
}
/* do we have enough bytes ? */
if ((spos + sle) > slen)
{
result = NO;
goto done;
result = NO;
goto done;
}
/* get the codepoint */
@ -962,11 +971,40 @@ GSToUnicode(unichar **dst, unsigned int *size, const unsigned char *src,
u = u & ~(0xffffffff << ((5 * sle) + 1));
spos += sle;
/* How many bytes needed to encode this character?
*/
if (u < 0x80)
{
i = 1;
}
else if (u < 0x800)
{
i = 2;
}
else if (u < 0x10000)
{
i = 3;
}
else
{
i = 4;
}
if (0 && i < sle)
{
result = NO; // Character was not minimally encoded.
goto done;
}
if ((u >= 0xd800) && (u <= 0xdfff))
{
result = NO; // Unmatched half of surrogate pair.
goto done;
}
if (u > 0x10ffff)
{
result = NO; // Outside the unicode range.
goto done;
}
}
else
{

View file

@ -0,0 +1,61 @@
#import <Foundation/NSString.h>
#import <Foundation/NSRegularExpression.h>
#import "ObjectTesting.h"
/* Test driver for UTF-8 decoding via +[NSString stringWithUTF8String:].
 *
 * Each case places a NUL terminated byte sequence in buf and checks
 * that an ill-formed sequence yields nil, or that a well-formed one
 * yields the expected string.
 *
 * Returns 0; individual results are reported by the PASS_EQUAL macro.
 */
int main(void)
{
  NSAutoreleasePool *arp = [NSAutoreleasePool new];

  START_SET("NSString + utf8")
  NSString *exp;
  NSString *str;
  unichar uni[2];
  /* Zero filled so that a sequence can never be left unterminated,
   * even if a test case forgets its explicit trailing NUL.
   */
  uint8_t buf[8] = { 0 };

  /* 0xc0 on its own: a lead byte which can never be legal. */
  buf[0] = 0xc0;
  buf[1] = 0x00;
  str = [NSString stringWithUTF8String: (const char *)buf];
  PASS_EQUAL(str, nil, "bare 0xc0 is illegal")

  /* 0xc0 0x80 is the overlong two byte form of U+0000. */
  buf[0] = 0xc0;
  buf[1] = 0x80;
  buf[2] = 0x00;
  str = [NSString stringWithUTF8String: (const char *)buf];
  PASS_EQUAL(str, nil, "non-minimal sequence is illegal")

  /* 0xed 0xa0 0x80 would decode to U+D800 (a high surrogate). */
  buf[0] = 0xed;
  buf[1] = 0xa0;
  buf[2] = 0x80;
  buf[3] = 0x00;
  str = [NSString stringWithUTF8String: (const char *)buf];
  PASS_EQUAL(str, nil, "lone high surrogate pair char is illegal")

  /* 0xed 0xb0 0x80 would decode to U+DC00 (a low surrogate). */
  buf[0] = 0xed;
  buf[1] = 0xb0;
  buf[2] = 0x80;
  buf[3] = 0x00;
  str = [NSString stringWithUTF8String: (const char *)buf];
  PASS_EQUAL(str, nil, "lone low surrogate pair char is illegal")

  /* 0xf4 0x90 0x80 0x80 would decode to U+110000, one past the
   * last unicode codepoint.
   */
  buf[0] = 0xf4;
  buf[1] = 0x90;
  buf[2] = 0x80;
  buf[3] = 0x80;
  buf[4] = 0x00;
  str = [NSString stringWithUTF8String: (const char *)buf];
  PASS_EQUAL(str, nil, "character too large is illegal")

  /* U+10FFFF is the largest codepoint; in UTF-16 it is the
   * surrogate pair 0xdbff 0xdfff.  The expected string is
   * autoreleased so that it is not leaked.
   */
  uni[0] = 0xdbff;
  uni[1] = 0xdfff;
  exp = [NSString stringWithCharacters: uni length: 2];
  buf[0] = 0xf4;
  buf[1] = 0x8f;
  buf[2] = 0xbf;
  buf[3] = 0xbf;
  buf[4] = 0x00;
  str = [NSString stringWithUTF8String: (const char *)buf];
  PASS_EQUAL(str, exp, "maximum unicode character ok")

  END_SET("NSString + utf8")

  [arp release];
  return 0;
}