Validate data returned by dataUsingEncoding:allowLossyConversion: for unicode

git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/base/trunk@22713 72102866-910b-0410-8b05-ffd578937521
This commit is contained in:
rfm 2006-03-26 11:34:47 +00:00
parent 65e8986db1
commit d77062ba88
3 changed files with 52 additions and 21 deletions

View file

@ -64,7 +64,7 @@ GS_EXPORT unichar *uni_is_decomp(unichar u);
#define GSUniBOM 0x08 #define GSUniBOM 0x08
#define GSUniShortOk 0x10 #define GSUniShortOk 0x10
GS_EXPORT BOOL GSIsUnicode(const unichar *chars, unsigned length, GS_EXPORT unsigned GSUnicode(const unichar *chars, unsigned length,
BOOL *isASCII, BOOL *isLatin1); BOOL *isASCII, BOOL *isLatin1);
GS_EXPORT BOOL GSFromUnicode(unsigned char **dst, unsigned int *size, GS_EXPORT BOOL GSFromUnicode(unsigned char **dst, unsigned int *size,
const unichar *src, unsigned int slen, NSStringEncoding enc, NSZone *zone, const unichar *src, unsigned int slen, NSStringEncoding enc, NSZone *zone,

View file

@ -1094,28 +1094,29 @@ int encode_cstrtoustr(unichar *dst, int dl, const char *src, int sl,
* Function to check a block of data for validity as a unicode string and * Function to check a block of data for validity as a unicode string and
* say whether it contains solely ASCII or solely Latin1 data.<br /> * say whether it contains solely ASCII or solely Latin1 data.<br />
* Any leading BOM must already have been removed and the data must already * Any leading BOM must already have been removed and the data must already
* be in native byte order. * be in native byte order.<br />
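* Returns the number of leading unichar values which were found valid
* (equal to length if the whole block is valid). Either isASCII or
* isLatin1 may be a null pointer if that information is not wanted.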
*/ */
BOOL unsigned
GSIsUnicode(const unichar *chars, unsigned length, GSUnicode(const unichar *chars, unsigned length,
BOOL *isASCII, BOOL *isLatin1) BOOL *isASCII, BOOL *isLatin1)
{ {
unsigned i = 0; unsigned i = 0;
unichar c; unichar c;
*isASCII = YES; if (isASCII) *isASCII = YES;
*isLatin1 = YES; if (isLatin1) *isLatin1 = YES;
while (i < length) while (i < length)
{ {
if ((c = chars[i++]) > 127) if ((c = chars[i++]) > 127)
{ {
*isASCII = NO; if (isASCII) *isASCII = NO;
i--; i--;
while (i < length) while (i < length)
{ {
if ((c = chars[i++]) > 255) if ((c = chars[i++]) > 255)
{ {
*isLatin1 = NO; if (isLatin1) *isLatin1 = NO;
i--; i--;
while (i < length) while (i < length)
{ {
@ -1123,23 +1124,23 @@ GSIsUnicode(const unichar *chars, unsigned length,
if (c == 0xfffe || c == 0xffff if (c == 0xfffe || c == 0xffff
|| (c >= 0xfdd0 && c <= 0xfdef)) || (c >= 0xfdd0 && c <= 0xfdef))
{ {
return NO; // Non-characters. return i - 1; // Non-characters.
} }
if (c >= 0xdc00 && c <= 0xdfff) if (c >= 0xdc00 && c <= 0xdfff)
{ {
return NO; // Second half of a surrogate pair. return i - 1; // Second half of a surrogate pair.
} }
if (c >= 0xd800 && c <= 0xdbff) if (c >= 0xd800 && c <= 0xdbff)
{ {
// First half of a surrogate pair. // First half of a surrogate pair.
if (i >= length) if (i >= length)
{ {
return NO; // Second half missing return i - 1; // Second half missing
} }
c = chars[i]; c = chars[i];
if (c < 0xdc00 || c > 0xdfff) if (c < 0xdc00 || c > 0xdfff)
{ {
return NO; // Second half missing return i - 1; // Second half missing
} }
i++; // Step past second half i++; // Step past second half
} }
@ -1148,7 +1149,7 @@ GSIsUnicode(const unichar *chars, unsigned length,
} }
} }
} }
return YES; return i;
} }
#define GROW() \ #define GROW() \

View file

@ -358,7 +358,7 @@ setup(void)
BOOL isASCII; BOOL isASCII;
BOOL isLatin1; BOOL isLatin1;
if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO) if (GSUnicode(chars, length, &isASCII, &isLatin1) != length)
{ {
return nil; // Invalid data return nil; // Invalid data
} }
@ -400,7 +400,7 @@ setup(void)
BOOL isASCII; BOOL isASCII;
BOOL isLatin1; BOOL isLatin1;
if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO) if (GSUnicode(chars, length, &isASCII, &isLatin1) != length)
{ {
return nil; // Invalid data return nil; // Invalid data
} }
@ -1286,14 +1286,44 @@ dataUsingEncoding_u(GSStr self, NSStringEncoding encoding, BOOL flag)
if (encoding == NSUnicodeStringEncoding) if (encoding == NSUnicodeStringEncoding)
{ {
unichar *buff; unichar *buff;
unsigned l;
unsigned from = 0;
unsigned to = 1;
if ((l = GSUnicode(self->_contents.u, len, 0, 0)) != len)
{
if (flag == NO)
{
return nil;
}
}
buff = (unichar*)NSZoneMalloc(NSDefaultMallocZone(), buff = (unichar*)NSZoneMalloc(NSDefaultMallocZone(),
sizeof(unichar)*(len+1)); sizeof(unichar)*(len+1));
buff[0] = 0xFEFF; buff[0] = 0xFEFF;
memcpy(buff+1, self->_contents.u, sizeof(unichar)*len);
while (len > 0)
{
if (l > 0)
{
memcpy(buff + to, self->_contents.u + from, sizeof(unichar)*l);
from += l;
to += l;
len -= l;
}
if (len > 0)
{
// A bad character in the string ... skip it.
if (--len > 0)
{
// Not at end ... try another batch.
from++;
l = GSUnicode(self->_contents.u + from, len, 0, 0);
}
}
}
return [NSData dataWithBytesNoCopy: buff return [NSData dataWithBytesNoCopy: buff
length: sizeof(unichar)*(len+1)]; length: sizeof(unichar)*to];
} }
else else
{ {
@ -3060,7 +3090,7 @@ agree, create a new GSUnicodeInlineString otherwise.
BOOL isASCII; BOOL isASCII;
BOOL isLatin1; BOOL isLatin1;
if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO) if (GSUnicode(chars, length, &isASCII, &isLatin1) != length)
{ {
RELEASE(self); RELEASE(self);
return nil; // Invalid data return nil; // Invalid data
@ -3120,7 +3150,7 @@ agree, create a new GSUnicodeInlineString otherwise.
BOOL isASCII; BOOL isASCII;
BOOL isLatin1; BOOL isLatin1;
if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO) if (GSUnicode(chars, length, &isASCII, &isLatin1) != length)
{ {
RELEASE(self); RELEASE(self);
return nil; // Invalid data return nil; // Invalid data
@ -3526,7 +3556,7 @@ agree, create a new GSUnicodeInlineString otherwise.
BOOL isASCII; BOOL isASCII;
BOOL isLatin1; BOOL isLatin1;
if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO) if (GSUnicode(chars, length, &isASCII, &isLatin1) != length)
{ {
RELEASE(self); RELEASE(self);
return nil; // Invalid data return nil; // Invalid data