Improve string validation .. check for invalid unicode characters.

git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/base/trunk@22712 72102866-910b-0410-8b05-ffd578937521
This commit is contained in:
Richard Frith-MacDonald 2006-03-26 10:59:57 +00:00
parent ba8f64f9e1
commit 55e0ca9228
5 changed files with 218 additions and 13 deletions

View file

@ -1,6 +1,11 @@
2006-03-25 Richard Frith-Macdonald <rfm@gnu.org>
* Source/win32/NSStreamWin32.m: Variout tidyups for pipe streams.
* Source/win32/NSStreamWin32.m: Various tidyups for pipe streams.
* Source/NSDictionary.m: Avoid compiler warning.
* Source/Additions/Unicode.m: Add unicode validation function.
* Headers/Additions/GNUstepBase/Unicode.h: ditto
* Source/GSString.m: validate unicode when initialising a string.
Also create 8bit data strings rather than 16bit where possible.
2006-03-24 Richard Frith-Macdonald <rfm@gnu.org>

View file

@ -64,6 +64,8 @@ GS_EXPORT unichar *uni_is_decomp(unichar u);
#define GSUniBOM 0x08
#define GSUniShortOk 0x10
GS_EXPORT BOOL GSIsUnicode(const unichar *chars, unsigned length,
BOOL *isASCII, BOOL *isLatin1);
GS_EXPORT BOOL GSFromUnicode(unsigned char **dst, unsigned int *size,
const unichar *src, unsigned int slen, NSStringEncoding enc, NSZone *zone,
unsigned int options);

View file

@ -1090,6 +1090,66 @@ int encode_cstrtoustr(unichar *dst, int dl, const char *src, int sl,
}
/**
 * Validates a buffer of 16-bit unicode values and classifies its contents.<br />
 * On return *isASCII is YES only if no value examined was above 127, and
 * *isLatin1 is YES only if no value examined was above 255.<br />
 * Returns NO as soon as a non-character (0xfffe, 0xffff, or anything in the
 * range 0xfdd0 to 0xfdef) or a broken surrogate pair is encountered, and
 * YES if the end of the buffer is reached without finding one.<br />
 * The caller must already have stripped any leading BOM and converted the
 * data to native byte order.
 */
BOOL
GSIsUnicode(const unichar *chars, unsigned length,
  BOOL *isASCII, BOOL *isLatin1)
{
  unsigned  pos;

  *isASCII = YES;
  *isLatin1 = YES;
  for (pos = 0; pos < length; pos++)
    {
      unichar  unit = chars[pos];

      if (unit <= 127)
        {
          continue;             // Plain ASCII ... nothing to record.
        }
      *isASCII = NO;
      if (unit <= 255)
        {
          continue;             // Still within the Latin1 range.
        }
      *isLatin1 = NO;

      /*
       * Everything that can make the data invalid lies above 255,
       * so the validity checks are only needed from this point on.
       */
      if (unit >= 0xd800 && unit <= 0xdfff)
        {
          unichar  next;

          if (unit >= 0xdc00)
            {
              return NO;        // Trailing surrogate with no leading one.
            }
          if (pos + 1 >= length)
            {
              return NO;        // Leading surrogate at end of data.
            }
          next = chars[pos + 1];
          if (next < 0xdc00 || next > 0xdfff)
            {
              return NO;        // Leading surrogate not followed by trailing.
            }
          pos++;                // Consume the trailing surrogate too.
        }
      else if (unit == 0xfffe || unit == 0xffff
        || (unit >= 0xfdd0 && unit <= 0xfdef))
        {
          return NO;            // Non-characters.
        }
    }
  return YES;
}
#define GROW() \
if (dst == 0) \

View file

@ -355,14 +355,37 @@ setup(void)
length: (unsigned)length
{
GSStr me;
BOOL isASCII;
BOOL isLatin1;
me = (GSStr)NSAllocateObject(GSUnicodeInlineStringClass,
length*sizeof(unichar), GSObjCZone(self));
me->_contents.u = (unichar*)&((GSUnicodeInlineString*)me)[1];
me->_count = length;
me->_flags.wide = 1;
me->_flags.free = 1;
memcpy(me->_contents.u, chars, length*sizeof(unichar));
if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO)
{
return nil; // Invalid data
}
if (isASCII == YES
|| (intEnc == NSISOLatin1StringEncoding && isLatin1 == YES))
{
me = (GSStr)NSAllocateObject(GSCInlineStringClass, length,
GSObjCZone(self));
me->_contents.c = (unsigned char*)&((GSCInlineString*)me)[1];
me->_count = length;
me->_flags.wide = 0;
me->_flags.free = 1;
while (length-- > 0)
{
me->_contents.c[length] = (unsigned char)chars[length];
}
}
else
{
me = (GSStr)NSAllocateObject(GSUnicodeInlineStringClass,
length*sizeof(unichar), GSObjCZone(self));
me->_contents.u = (unichar*)&((GSUnicodeInlineString*)me)[1];
me->_count = length;
me->_flags.wide = 1;
me->_flags.free = 1;
memcpy(me->_contents.u, chars, length*sizeof(unichar));
}
return (id)me;
}
@ -374,14 +397,41 @@ setup(void)
freeWhenDone: (BOOL)flag
{
GSStr me;
BOOL isASCII;
BOOL isLatin1;
me = (GSStr)NSAllocateObject(GSUnicodeBufferStringClass, 0, GSObjCZone(self));
me->_contents.u = chars;
me->_count = length;
me->_flags.wide = 1;
if (flag == YES)
if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO)
{
return nil; // Invalid data
}
if (isASCII == YES
|| (intEnc == NSISOLatin1StringEncoding && isLatin1 == YES))
{
/*
* OK ... we can do a more compact version
*/
me = (GSStr)NSAllocateObject(GSCInlineStringClass, length,
GSObjCZone(self));
me->_contents.c = (unsigned char*)&((GSCInlineString*)me)[1];
me->_count = length;
me->_flags.wide = 0;
me->_flags.free = 1;
while (length-- > 0)
{
me->_contents.c[length] = (unsigned char)chars[length];
}
}
else
{
me = (GSStr)NSAllocateObject(GSUnicodeBufferStringClass,
0, GSObjCZone(self));
me->_contents.u = chars;
me->_count = length;
me->_flags.wide = 1;
if (flag == YES)
{
me->_flags.free = 1;
}
}
return (id)me;
}
@ -3007,6 +3057,35 @@ agree, create a new GSUnicodeInlineString otherwise.
length: (unsigned int)length
freeWhenDone: (BOOL)flag
{
BOOL isASCII;
BOOL isLatin1;
if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO)
{
RELEASE(self);
return nil; // Invalid data
}
if (isASCII == YES
|| (intEnc == NSISOLatin1StringEncoding && isLatin1 == YES))
{
GSStr me;
/*
* OK ... we can do a more compact version
*/
me = (GSStr)NSAllocateObject(GSCInlineStringClass, length,
GSObjCZone(self));
me->_contents.c = (unsigned char*)&((GSCInlineString*)me)[1];
me->_count = length;
me->_flags.wide = 0;
me->_flags.free = 1;
while (length-- > 0)
{
me->_contents.c[length] = (unsigned char)chars[length];
}
RELEASE(self);
return (id)me;
}
if (_contents.u != 0)
{
[NSException raise: NSInternalInconsistencyException
@ -3038,6 +3117,35 @@ agree, create a new GSUnicodeInlineString otherwise.
@implementation GSUnicodeInlineString
- (id) initWithCharacters: (const unichar*)chars length: (unsigned)length
{
BOOL isASCII;
BOOL isLatin1;
if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO)
{
RELEASE(self);
return nil; // Invalid data
}
if (isASCII == YES
|| (intEnc == NSISOLatin1StringEncoding && isLatin1 == YES))
{
GSStr me;
/*
* OK ... we can do a more compact version
*/
me = (GSStr)NSAllocateObject(GSCInlineStringClass, length,
GSObjCZone(self));
me->_contents.c = (unsigned char*)&((GSCInlineString*)me)[1];
me->_count = length;
me->_flags.wide = 0;
me->_flags.free = 1;
while (length-- > 0)
{
me->_contents.c[length] = (unsigned char)chars[length];
}
RELEASE(self);
return (id)me;
}
if (_contents.u != 0)
{
[NSException raise: NSInternalInconsistencyException
@ -3415,6 +3523,35 @@ agree, create a new GSUnicodeInlineString otherwise.
length: (unsigned int)length
freeWhenDone: (BOOL)flag
{
BOOL isASCII;
BOOL isLatin1;
if (GSIsUnicode(chars, length, &isASCII, &isLatin1) == NO)
{
RELEASE(self);
return nil; // Invalid data
}
if (isASCII == YES
|| (intEnc == NSISOLatin1StringEncoding && isLatin1 == YES))
{
GSStr me;
/*
* OK ... we can do a more compact version
*/
me = (GSStr)NSAllocateObject(GSCInlineStringClass, length,
GSObjCZone(self));
me->_contents.c = (unsigned char*)&((GSCInlineString*)me)[1];
me->_count = length;
me->_flags.wide = 0;
me->_flags.free = 1;
while (length-- > 0)
{
me->_contents.c[length] = (unsigned char)chars[length];
}
RELEASE(self);
return (id)me;
}
_count = length;
_capacity = length;
_contents.u = chars;

View file

@ -39,6 +39,7 @@
#include "Foundation/NSDebug.h"
#include "Foundation/NSObjCRuntime.h"
#include "Foundation/NSValue.h"
#include "Foundation/NSKeyValueCoding.h"
// For private method _decodeArrayOfObjectsForKey:
#include "Foundation/NSKeyedArchiver.h"
#include "GNUstepBase/GSCategories.h"