diff --git a/Headers/Additions/GNUstepBase/Unicode.h b/Headers/Additions/GNUstepBase/Unicode.h
index aabea0257..abd9b6385 100644
--- a/Headers/Additions/GNUstepBase/Unicode.h
+++ b/Headers/Additions/GNUstepBase/Unicode.h
@@ -64,7 +64,7 @@ GS_EXPORT unichar *uni_is_decomp(unichar u);
#define GSUniBOM 0x08
#define GSUniShortOk 0x10
-GS_EXPORT BOOL GSIsUnicode(const unichar *chars, unsigned length,
+GS_EXPORT unsigned GSUnicode(const unichar *chars, unsigned length,
BOOL *isASCII, BOOL *isLatin1);
GS_EXPORT BOOL GSFromUnicode(unsigned char **dst, unsigned int *size,
const unichar *src, unsigned int slen, NSStringEncoding enc, NSZone *zone,
diff --git a/Source/Additions/Unicode.m b/Source/Additions/Unicode.m
index a19808f36..b55cd48c1 100644
--- a/Source/Additions/Unicode.m
+++ b/Source/Additions/Unicode.m
@@ -1094,28 +1094,29 @@ int encode_cstrtoustr(unichar *dst, int dl, const char *src, int sl,
* Function to check a block of data for validity as a unicode string and
* say whether it contains solely ASCII or solely Latin1 data.
* Any leading BOM must already have been removed and the data must already
- * be in native byte order.
+ * be in native byte order.
+ * Returns the number of leading valid characters (equal to length if all the data is valid, otherwise the index of the first invalid character).
*/
-BOOL
-GSIsUnicode(const unichar *chars, unsigned length,
+unsigned
+GSUnicode(const unichar *chars, unsigned length,
BOOL *isASCII, BOOL *isLatin1)
{
unsigned i = 0;
unichar c;
- *isASCII = YES;
- *isLatin1 = YES;
+ if (isASCII) *isASCII = YES;
+ if (isLatin1) *isLatin1 = YES;
while (i < length)
{
if ((c = chars[i++]) > 127)
{
- *isASCII = NO;
+ if (isASCII) *isASCII = NO;
i--;
while (i < length)
{
if ((c = chars[i++]) > 255)
{
- *isLatin1 = NO;
+ if (isLatin1) *isLatin1 = NO;
i--;
while (i < length)
{
@@ -1123,23 +1124,23 @@ GSIsUnicode(const unichar *chars, unsigned length,
if (c == 0xfffe || c == 0xffff
|| (c >= 0xfdd0 && c <= 0xfdef))
{
- return NO; // Non-characters.
+ return i - 1; // Non-characters.
}
if (c >= 0xdc00 && c <= 0xdfff)
{
- return NO; // Second half of a surrogate pair.
+ return i - 1; // Second half of a surrogate pair.
}
if (c >= 0xd800 && c <= 0xdbff)
{
// First half of a surrogate pair.
if (i >= length)
{
- return NO; // Second half missing
+ return i - 1; // Second half missing
}
c = chars[i];
if (c < 0xdc00 || c > 0xdfff)
{
- return NO; // Second half missing
+ return i - 1; // Second half missing
}
i++; // Step past second half
}
@@ -1148,7 +1149,7 @@ GSIsUnicode(const unichar *chars, unsigned length,
}
}
}
- return YES;
+ return i;
}
#define GROW() \
diff --git a/Source/GSString.m b/Source/GSString.m
index 59eaef984..b692a9d69 100644
--- a/Source/GSString.m
+++ b/Source/GSString.m
@@ -358,7 +358,7 @@ setup(void)
BOOL isASCII;
BOOL isLatin1;
- if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO)
+ if (GSUnicode(chars, length, &isASCII, &isLatin1) != length)
{
return nil; // Invalid data
}
@@ -400,7 +400,7 @@ setup(void)
BOOL isASCII;
BOOL isLatin1;
- if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO)
+ if (GSUnicode(chars, length, &isASCII, &isLatin1) != length)
{
return nil; // Invalid data
}
@@ -1286,14 +1286,44 @@ dataUsingEncoding_u(GSStr self, NSStringEncoding encoding, BOOL flag)
if (encoding == NSUnicodeStringEncoding)
{
- unichar *buff;
+ unichar *buff;
+ unsigned l;
+ unsigned from = 0;
+ unsigned to = 1;
+ if ((l = GSUnicode(self->_contents.u, len, 0, 0)) != len)
+ {
+ if (flag == NO)
+ {
+ return nil;
+ }
+ }
buff = (unichar*)NSZoneMalloc(NSDefaultMallocZone(),
sizeof(unichar)*(len+1));
buff[0] = 0xFEFF;
- memcpy(buff+1, self->_contents.u, sizeof(unichar)*len);
+
+ while (len > 0)
+ {
+ if (l > 0)
+ {
+ memcpy(buff + to, self->_contents.u + from, sizeof(unichar)*l);
+ from += l;
+ to += l;
+ len -= l;
+ }
+ if (len > 0)
+ {
+ // A bad character in the string ... skip it.
+ if (--len > 0)
+ {
+ // Not at end ... try another batch.
+ from++;
+ l = GSUnicode(self->_contents.u + from, len, 0, 0);
+ }
+ }
+ }
return [NSData dataWithBytesNoCopy: buff
- length: sizeof(unichar)*(len+1)];
+ length: sizeof(unichar)*to];
}
else
{
@@ -3060,7 +3090,7 @@ agree, create a new GSUnicodeInlineString otherwise.
BOOL isASCII;
BOOL isLatin1;
- if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO)
+ if (GSUnicode(chars, length, &isASCII, &isLatin1) != length)
{
RELEASE(self);
return nil; // Invalid data
@@ -3120,7 +3150,7 @@ agree, create a new GSUnicodeInlineString otherwise.
BOOL isASCII;
BOOL isLatin1;
- if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO)
+ if (GSUnicode(chars, length, &isASCII, &isLatin1) != length)
{
RELEASE(self);
return nil; // Invalid data
@@ -3526,7 +3556,7 @@ agree, create a new GSUnicodeInlineString otherwise.
BOOL isASCII;
BOOL isLatin1;
- if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO)
+ if (GSUnicode(chars, length, &isASCII, &isLatin1) != length)
{
RELEASE(self);
return nil; // Invalid data