Validate data returned by dataUsingEncoding:allowLossyConversion: for unicode

git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/base/trunk@22713 72102866-910b-0410-8b05-ffd578937521
2025-05-31 08:41:03 +00:00 · 2006-03-26 11:34:47 +00:00 · 2006-03-26 11:34:47 +00:00 · d77062ba88
commit d77062ba88
parent 65e8986db1
3 changed files with 52 additions and 21 deletions
--- a/Headers/Additions/GNUstepBase/Unicode.h
+++ b/Headers/Additions/GNUstepBase/Unicode.h
@@ -64,7 +64,7 @@ GS_EXPORT unichar *uni_is_decomp(unichar u);
 #define	GSUniBOM	0x08
 #define	GSUniShortOk	0x10
-GS_EXPORT BOOL GSIsUnicode(const unichar *chars, unsigned length,
+GS_EXPORT unsigned GSUnicode(const unichar *chars, unsigned length,
  BOOL *isASCII, BOOL *isLatin1);
 GS_EXPORT BOOL GSFromUnicode(unsigned char **dst, unsigned int *size,
  const unichar *src, unsigned int slen, NSStringEncoding enc, NSZone *zone,
--- a/Source/Additions/Unicode.m
+++ b/Source/Additions/Unicode.m
@@ -1094,28 +1094,29 @@ int encode_cstrtoustr(unichar *dst, int dl, const char *src, int sl,
 * Function to check a block of data for validity as a unicode string and
 * say whether it contains solely ASCII or solely Latin1 data.<br />
 * Any leading BOM must already have been removed and the data must already
- * be in native byte order.
+ * be in native byte order.<br />
 * Returns the number of characters which were found valid.
 */
-BOOL
+unsigned
-GSIsUnicode(const unichar *chars, unsigned length,
+GSUnicode(const unichar *chars, unsigned length,
  BOOL *isASCII, BOOL *isLatin1)
 {
  unsigned	i = 0;
  unichar	c;
-  *isASCII = YES;
+  if (isASCII) *isASCII = YES;
-  *isLatin1 = YES;
+  if (isLatin1) *isLatin1 = YES;
  while (i < length)
    {
      if ((c = chars[i++]) > 127)
        {
-	  *isASCII = NO;
+	  if (isASCII) *isASCII = NO;
 	  i--;
 	  while (i < length)
 	    {
 	      if ((c = chars[i++]) > 255)
 		{
-		  *isLatin1 = NO;
+		  if (isLatin1) *isLatin1 = NO;
 		  i--;
 		  while (i < length)
 		    {
@@ -1123,23 +1124,23 @@ GSIsUnicode(const unichar *chars, unsigned length,
 		      if (c == 0xfffe || c == 0xffff
 			|| (c >= 0xfdd0 && c <= 0xfdef))
 			{
-			  return NO;	// Non-characters.
+			  return i - 1;	// Non-characters.
 			}
 		      if (c >= 0xdc00 && c <= 0xdfff)
 		        {
-			  return NO;	// Second half of a surrogate pair.
+			  return i - 1;	// Second half of a surrogate pair.
 		        }
 		      if (c >= 0xd800 && c <= 0xdbff)
 		        {
 			  // First half of a surrogate pair.
 			  if (i >= length)
 			    {
-			      return NO;	// Second half missing
+			      return i - 1;	// Second half missing
 			    }
 			  c = chars[i];
 			  if (c < 0xdc00 || c > 0xdfff)
 			    {
-			      return NO;	// Second half missing
+			      return i - 1;	// Second half missing
 			    }
 			  i++;		// Step past second half
 		        }
@@ -1148,7 +1149,7 @@ GSIsUnicode(const unichar *chars, unsigned length,
 	    }
        }
    }
-  return YES;
+  return i;
 }
 #define	GROW() \
--- a/Source/GSString.m
+++ b/Source/GSString.m
@@ -358,7 +358,7 @@ setup(void)
  BOOL	isASCII;
  BOOL	isLatin1;
-  if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO)
+  if (GSUnicode(chars, length, &isASCII, &isLatin1) != length)
    {
      return nil;	// Invalid data
    }
@@ -400,7 +400,7 @@ setup(void)
  BOOL	isASCII;
  BOOL	isLatin1;
-  if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO)
+  if (GSUnicode(chars, length, &isASCII, &isLatin1) != length)
    {
      return nil;	// Invalid data
    }
@@ -1286,14 +1286,44 @@ dataUsingEncoding_u(GSStr self, NSStringEncoding encoding, BOOL flag)
  if (encoding == NSUnicodeStringEncoding)
    {
-      unichar *buff;
+      unichar	*buff;
      unsigned	l;
      unsigned	from = 0;
      unsigned	to = 1;
      if ((l = GSUnicode(self->_contents.u, len, 0, 0)) != len)
        {
 	  if (flag == NO)
 	    {
 	      return nil;
 	    }
 	}
      buff = (unichar*)NSZoneMalloc(NSDefaultMallocZone(),
 	sizeof(unichar)*(len+1));
      buff[0] = 0xFEFF;
-      memcpy(buff+1, self->_contents.u, sizeof(unichar)*len);
+
      while (len > 0)
        {
 	  if (l > 0)
 	    {
 	      memcpy(buff + to, self->_contents.u + from, sizeof(unichar)*l);
 	      from += l;
 	      to += l;
 	      len -= l;
 	    }
 	  if (len > 0)
 	    {
 	      // A bad character in the string ... skip it.
 	      if (--len > 0)
 		{
 		  // Not at end ... try another batch.
 		  from++;
 		  l = GSUnicode(self->_contents.u + from, len, 0, 0);
 		}
 	    }
 	}
      return [NSData dataWithBytesNoCopy: buff
-				  length: sizeof(unichar)*(len+1)];
+				  length: sizeof(unichar)*to];
    }
  else
    {
@@ -3060,7 +3090,7 @@ agree, create a new GSUnicodeInlineString otherwise.
  BOOL	isASCII;
  BOOL	isLatin1;
-  if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO)
+  if (GSUnicode(chars, length, &isASCII, &isLatin1) != length)
    {
      RELEASE(self);
      return nil;	// Invalid data
@@ -3120,7 +3150,7 @@ agree, create a new GSUnicodeInlineString otherwise.
  BOOL	isASCII;
  BOOL	isLatin1;
-  if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO)
+  if (GSUnicode(chars, length, &isASCII, &isLatin1) != length)
    {
      RELEASE(self);
      return nil;	// Invalid data
@@ -3526,7 +3556,7 @@ agree, create a new GSUnicodeInlineString otherwise.
  BOOL	isASCII;
  BOOL	isLatin1;
-  if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO)
+  if (GSUnicode(chars, length, &isASCII, &isLatin1) != length)
    {
      RELEASE(self);
      return nil;	// Invalid data