mirror of
https://github.com/gnustep/libs-base.git
synced 2025-04-22 16:33:29 +00:00
UTF8 parsing improvements
This commit is contained in:
parent
053862f652
commit
e60b2004af
3 changed files with 114 additions and 9 deletions
|
@ -1,3 +1,9 @@
|
|||
2018-07-14 Richard Frith-Macdonald <rfm@gnu.org>
|
||||
|
||||
* Source/Additions/Unicode.m: improve utf8 validity checks,
|
||||
switch to state machine based utf8 parsing for better performance.
|
||||
* Tests/base/NSString/utf8.m: add a few tests for utf8 parsing.
|
||||
|
||||
2018-07-10 Richard Frith-Macdonald <rfm@gnu.org>
|
||||
|
||||
* configure.ac:
|
||||
|
|
|
@ -926,6 +926,22 @@ GSToUnicode(unichar **dst, unsigned int *size, const unsigned char *src,
|
|||
{
|
||||
int i, sle = 0;
|
||||
|
||||
/* legal first byte of a multibyte character?
|
||||
*/
|
||||
if (c <= 0xc1 || c >= 0xf5)
|
||||
{
|
||||
/* (0x80 <= c < 0xc0) means this is a continuation
|
||||
* of a multibyte character without the first byte.
|
||||
*
|
||||
* (0xc0 == c || 0xc1 == c) are always illegal because they could only begin a non-minimal (overlong) encoding of a character below 0x80.
|
||||
*
|
||||
* (c >= 0xf5) would be for a multibyte character
|
||||
* outside the unicode range.
|
||||
*/
|
||||
result = NO;
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* calculate the expected sequence length */
|
||||
while (c & 0x80)
|
||||
{
|
||||
|
@ -933,18 +949,11 @@ GSToUnicode(unichar **dst, unsigned int *size, const unsigned char *src,
|
|||
sle++;
|
||||
}
|
||||
|
||||
/* legal ? */
|
||||
if ((sle < 2) || (sle > 6))
|
||||
{
|
||||
result = NO;
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* do we have enough bytes ? */
|
||||
if ((spos + sle) > slen)
|
||||
{
|
||||
result = NO;
|
||||
goto done;
|
||||
result = NO;
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* get the codepoint */
|
||||
|
@ -962,11 +971,40 @@ GSToUnicode(unichar **dst, unsigned int *size, const unsigned char *src,
|
|||
u = u & ~(0xffffffff << ((5 * sle) + 1));
|
||||
spos += sle;
|
||||
|
||||
/* How many bytes needed to encode this character?
|
||||
*/
|
||||
if (u < 0x80)
|
||||
{
|
||||
i = 1;
|
||||
}
|
||||
else if (u < 0x800)
|
||||
{
|
||||
i = 2;
|
||||
}
|
||||
else if (u < 0x10000)
|
||||
{
|
||||
i = 3;
|
||||
}
|
||||
else
|
||||
{
|
||||
i = 4;
|
||||
}
|
||||
if (0 && i < sle)
|
||||
{
|
||||
result = NO; // Character was not minimally encoded.
|
||||
goto done;
|
||||
}
|
||||
|
||||
if ((u >= 0xd800) && (u <= 0xdfff))
|
||||
{
|
||||
result = NO; // Unmatched half of surrogate pair.
|
||||
goto done;
|
||||
}
|
||||
if (u > 0x10ffff)
|
||||
{
|
||||
result = NO; // Outside the unicode range.
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
61
Tests/base/NSString/utf8.m
Normal file
61
Tests/base/NSString/utf8.m
Normal file
|
@ -0,0 +1,61 @@
|
|||
#import <Foundation/NSString.h>
|
||||
#import <Foundation/NSRegularExpression.h>
|
||||
#import "ObjectTesting.h"
|
||||
|
||||
/* Checks that +stringWithUTF8String: rejects malformed UTF-8 input
 * (returning nil) and accepts the largest legal code point.
 *
 * Every byte sequence handed to the method is explicitly NUL terminated;
 * buf is an uninitialised local, so relying on leftover contents would
 * let the method read indeterminate bytes beyond the intended sequence.
 */
int main(void)
{
  NSAutoreleasePool     *arp = [NSAutoreleasePool new];

  START_SET("NSString + utf8")

  NSString      *exp;
  NSString      *str;
  uint16_t      uni[2];
  uint8_t       buf[8];

  /* 0xc0 followed directly by the terminator: a lead byte with no
   * continuation byte.
   */
  buf[0] = 0xc0;
  buf[1] = 0x00;
  str = [NSString stringWithUTF8String: (const char *)buf];
  PASS_EQUAL(str, nil, "bare 0xc0 is illegal")

  /* 0xc0 0x80 is a two byte (overlong) encoding of U+0000.
   */
  buf[0] = 0xc0;
  buf[1] = 0x80;
  buf[2] = 0x00;
  str = [NSString stringWithUTF8String: (const char *)buf];
  PASS_EQUAL(str, nil, "non-minimal sequence is illegal")

  /* 0xed 0xa0 0x80 encodes U+D800 (a high surrogate).
   */
  buf[0] = 0xed;
  buf[1] = 0xa0;
  buf[2] = 0x80;
  buf[3] = 0x00;        // terminate: buf[3] was never written before this
  str = [NSString stringWithUTF8String: (const char *)buf];
  PASS_EQUAL(str, nil, "lone high surrogate pair char is illegal")

  /* 0xed 0xb0 0x80 encodes U+DC00 (a low surrogate).
   */
  buf[0] = 0xed;
  buf[1] = 0xb0;
  buf[2] = 0x80;
  buf[3] = 0x00;        // terminate explicitly rather than rely on state
  str = [NSString stringWithUTF8String: (const char *)buf];
  PASS_EQUAL(str, nil, "lone low surrogate pair char is illegal")

  /* 0xf4 0x90 0x80 0x80 encodes U+110000, one past the unicode range.
   */
  buf[0] = 0xf4;
  buf[1] = 0x90;
  buf[2] = 0x80;
  buf[3] = 0x80;
  buf[4] = 0x00;
  str = [NSString stringWithUTF8String: (const char *)buf];
  PASS_EQUAL(str, nil, "character too large is illegal")

  /* 0xf4 0x8f 0xbf 0xbf encodes U+10FFFF, whose UTF-16 form is the
   * surrogate pair 0xdbff 0xdfff.
   */
  uni[0] = 0xdbff;
  uni[1] = 0xdfff;
  exp = [[[NSString alloc] initWithCharacters: uni length: 2] autorelease];
  buf[0] = 0xf4;
  buf[1] = 0x8f;
  buf[2] = 0xbf;
  buf[3] = 0xbf;
  buf[4] = 0x00;
  str = [NSString stringWithUTF8String: (const char *)buf];
  PASS_EQUAL(str, exp, "maximum unicode character ok")

  END_SET("NSString + utf8")

  [arp release];
  arp = nil;
  return 0;
}
|
Loading…
Reference in a new issue