From d77062ba8829033646094a3be0a8a254fae9ad2b Mon Sep 17 00:00:00 2001 From: rfm Date: Sun, 26 Mar 2006 11:34:47 +0000 Subject: [PATCH] Validate data returned by dataUsingEncoding:allowLossyConversion: for unicode git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/base/trunk@22713 72102866-910b-0410-8b05-ffd578937521 --- Headers/Additions/GNUstepBase/Unicode.h | 2 +- Source/Additions/Unicode.m | 25 +++++++------- Source/GSString.m | 46 ++++++++++++++++++++----- 3 files changed, 52 insertions(+), 21 deletions(-) diff --git a/Headers/Additions/GNUstepBase/Unicode.h b/Headers/Additions/GNUstepBase/Unicode.h index aabea0257..abd9b6385 100644 --- a/Headers/Additions/GNUstepBase/Unicode.h +++ b/Headers/Additions/GNUstepBase/Unicode.h @@ -64,7 +64,7 @@ GS_EXPORT unichar *uni_is_decomp(unichar u); #define GSUniBOM 0x08 #define GSUniShortOk 0x10 -GS_EXPORT BOOL GSIsUnicode(const unichar *chars, unsigned length, +GS_EXPORT unsigned GSUnicode(const unichar *chars, unsigned length, BOOL *isASCII, BOOL *isLatin1); GS_EXPORT BOOL GSFromUnicode(unsigned char **dst, unsigned int *size, const unichar *src, unsigned int slen, NSStringEncoding enc, NSZone *zone, diff --git a/Source/Additions/Unicode.m b/Source/Additions/Unicode.m index a19808f36..b55cd48c1 100644 --- a/Source/Additions/Unicode.m +++ b/Source/Additions/Unicode.m @@ -1094,28 +1094,29 @@ int encode_cstrtoustr(unichar *dst, int dl, const char *src, int sl, * Function to check a block of data for validity as a unicode string and * say whether it contains solely ASCII or solely Latin1 data.
* Any leading BOM must already have been removed and the data must already - * be in native byte order. + * be in native byte order.
+ * Returns the number of characters which were found valid. */ -BOOL -GSIsUnicode(const unichar *chars, unsigned length, +unsigned +GSUnicode(const unichar *chars, unsigned length, BOOL *isASCII, BOOL *isLatin1) { unsigned i = 0; unichar c; - *isASCII = YES; - *isLatin1 = YES; + if (isASCII) *isASCII = YES; + if (isLatin1) *isLatin1 = YES; while (i < length) { if ((c = chars[i++]) > 127) { - *isASCII = NO; + if (isASCII) *isASCII = NO; i--; while (i < length) { if ((c = chars[i++]) > 255) { - *isLatin1 = NO; + if (isLatin1) *isLatin1 = NO; i--; while (i < length) { @@ -1123,23 +1124,23 @@ GSIsUnicode(const unichar *chars, unsigned length, if (c == 0xfffe || c == 0xffff || (c >= 0xfdd0 && c <= 0xfdef)) { - return NO; // Non-characters. + return i - 1; // Non-characters. } if (c >= 0xdc00 && c <= 0xdfff) { - return NO; // Second half of a surrogate pair. + return i - 1; // Second half of a surrogate pair. } if (c >= 0xd800 && c <= 0xdbff) { // First half of a surrogate pair. if (i >= length) { - return NO; // Second half missing + return i - 1; // Second half missing } c = chars[i]; if (c < 0xdc00 || c > 0xdfff) { - return NO; // Second half missing + return i - 1; // Second half missing } i++; // Step past second half } @@ -1148,7 +1149,7 @@ GSIsUnicode(const unichar *chars, unsigned length, } } } - return YES; + return i; } #define GROW() \ diff --git a/Source/GSString.m b/Source/GSString.m index 59eaef984..b692a9d69 100644 --- a/Source/GSString.m +++ b/Source/GSString.m @@ -358,7 +358,7 @@ setup(void) BOOL isASCII; BOOL isLatin1; - if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO) + if (GSUnicode(chars, length, &isASCII, &isLatin1) != length) { return nil; // Invalid data } @@ -400,7 +400,7 @@ setup(void) BOOL isASCII; BOOL isLatin1; - if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO) + if (GSUnicode(chars, length, &isASCII, &isLatin1) != length) { return nil; // Invalid data } @@ -1286,14 +1286,44 @@ dataUsingEncoding_u(GSStr self, NSStringEncoding encoding, BOOL flag) if (encoding == NSUnicodeStringEncoding) { - unichar *buff; + unichar *buff; + unsigned l; + unsigned from = 0; + unsigned to = 1; + if ((l = GSUnicode(self->_contents.u, len, 0, 0)) != len) + { + if (flag == NO) + { + return nil; + } + } buff = (unichar*)NSZoneMalloc(NSDefaultMallocZone(), sizeof(unichar)*(len+1)); buff[0] = 0xFEFF; - memcpy(buff+1, self->_contents.u, sizeof(unichar)*len); + + while (len > 0) + { + if (l > 0) + { + memcpy(buff + to, self->_contents.u + from, sizeof(unichar)*l); + from += l; + to += l; + len -= l; + } + if (len > 0) + { + // A bad character in the string ... skip it. + if (--len > 0) + { + // Not at end ... try another batch. + from++; + l = GSUnicode(self->_contents.u + from, len, 0, 0); + } + } + } return [NSData dataWithBytesNoCopy: buff - length: sizeof(unichar)*(len+1)]; + length: sizeof(unichar)*to]; } else { @@ -3060,7 +3090,7 @@ agree, create a new GSUnicodeInlineString otherwise. BOOL isASCII; BOOL isLatin1; - if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO) + if (GSUnicode(chars, length, &isASCII, &isLatin1) != length) { RELEASE(self); return nil; // Invalid data @@ -3120,7 +3150,7 @@ agree, create a new GSUnicodeInlineString otherwise. BOOL isASCII; BOOL isLatin1; - if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO) + if (GSUnicode(chars, length, &isASCII, &isLatin1) != length) { RELEASE(self); return nil; // Invalid data @@ -3526,7 +3556,7 @@ agree, create a new GSUnicodeInlineString otherwise. BOOL isASCII; BOOL isLatin1; - if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO) + if (GSUnicode(chars, length, &isASCII, &isLatin1) != length) { RELEASE(self); return nil; // Invalid data