Be aware of unicode BOM in UTF8 data.

git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/base/trunk@14639 72102866-910b-0410-8b05-ffd578937521
This commit is contained in:
CaS 2002-10-04 09:32:48 +00:00
parent 72d997aa58
commit 89663963e2
2 changed files with 47 additions and 8 deletions

View file

@ -4,7 +4,8 @@
is released properly on failure, and we don't generate log messages is released properly on failure, and we don't generate log messages
where a lower level API should be doing it. Added some documentation. where a lower level API should be doing it. Added some documentation.
* Source/NSDictionary.m: ditto * Source/NSDictionary.m: ditto
* Source/NSString.m: ditto * Source/NSString.m: ditto. Also add support for understanding the
unicode BOM at the start of UTF8 data and stripping it.
* Source/NSData.m: Tidied read from and write to file, * Source/NSData.m: Tidied read from and write to file,
adding lots of logging information. Also documented quite a bit. adding lots of logging information. Also documented quite a bit.
Resolved all conflicts found with Adam's change ... generally in Resolved all conflicts found with Adam's change ... generally in

View file

@ -1135,6 +1135,14 @@ handle_printf_atsign (FILE *stream,
} }
#endif #endif
/**
* Initialises the receiver with the supplied data, using the
* specified encoding.<br />
* For NSUnicodeStringEncoding and NSUTF8StringEncoding, a Byte Order
* Marker (if present at the start of the data) is removed automatically.<br />
* If the data cannot be interpreted using the encoding, the receiver
* is released and nil is returned.
*/
- (id) initWithData: (NSData*)data - (id) initWithData: (NSData*)data
encoding: (NSStringEncoding)encoding encoding: (NSStringEncoding)encoding
{ {
@ -1161,8 +1169,18 @@ handle_printf_atsign (FILE *stream,
} }
else if (encoding == NSUTF8StringEncoding) else if (encoding == NSUTF8StringEncoding)
{ {
const char *bytes = [data bytes]; const unsigned char *bytes = [data bytes];
unsigned i = 0; unsigned i = 0;
/*
* If the data begins with the UTF8 Byte Order Marker (as a
* signature for UTF8 data) we must remove it.
*/
if (len > 2 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF)
{
len -= 3;
bytes += 3;
}
if (_ByteEncodingOk) if (_ByteEncodingOk)
{ {
@ -1172,7 +1190,7 @@ handle_printf_atsign (FILE *stream,
*/ */
while (i < len) while (i < len)
{ {
if (((unsigned char*)bytes)[i] > 127) if ((bytes)[i] > 127)
{ {
break; break;
} }
@ -1280,12 +1298,22 @@ handle_printf_atsign (FILE *stream,
} }
/** /**
* Initialises the receiver with the contents of the file at path.<br /> * <p>Initialises the receiver with the contents of the file at path.
* Invokes [NSData-initWithContentsOfFile:] to read the file, then * </p>
* <p>Invokes [NSData-initWithContentsOfFile:] to read the file, then
* examines the data to infer its encoding type, and converts the * examines the data to infer its encoding type, and converts the
* data to a string using -initWithData:encoding:<br /> * data to a string using -initWithData:encoding:
* Releases the receiver and returns nil if the file could not be read * </p>
* <p>The encoding to use is determined as follows: if the data begins
* with the 16-bit Unicode Byte Order Marker, then it is assumed to be
* Unicode data in the appropriate ordering and converted as such.<br />
* If it begins with a UTF8 representation of the BOM, the UTF8 encoding
* is used.<br />
* Otherwise, the default C String encoding is used.
* </p>
* <p>Releases the receiver and returns nil if the file could not be read
* and converted to a string. * and converted to a string.
* </p>
*/ */
- (id) initWithContentsOfFile: (NSString*)path - (id) initWithContentsOfFile: (NSString*)path
{ {
@ -1303,6 +1331,8 @@ handle_printf_atsign (FILE *stream,
len = [d length]; len = [d length];
if (len == 0) if (len == 0)
{ {
RELEASE(d);
RELEASE(self);
return @""; return @"";
} }
test = [d bytes]; test = [d bytes];
@ -1313,6 +1343,10 @@ handle_printf_atsign (FILE *stream,
/* somebody set up us the BOM! */ /* somebody set up us the BOM! */
enc = NSUnicodeStringEncoding; enc = NSUnicodeStringEncoding;
} }
else if (len > 2 && test[0] == 0xEF && test[1] == 0xBB && test[2] == 0xBF)
{
enc = NSUTF8StringEncoding;
}
} }
self = [self initWithData: d encoding: enc]; self = [self initWithData: d encoding: enc];
RELEASE(d); RELEASE(d);
@ -1348,6 +1382,10 @@ handle_printf_atsign (FILE *stream,
{ {
enc = NSUnicodeStringEncoding; enc = NSUnicodeStringEncoding;
} }
else if (len > 2 && test[0] == 0xEF && test[1] == 0xBB && test[2] == 0xBF)
{
enc = NSUTF8StringEncoding;
}
} }
self = [self initWithData: d encoding: enc]; self = [self initWithData: d encoding: enc];
if (self == nil) if (self == nil)