Be aware of unicode BOM in UTF8 data.

git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/base/trunk@14639 72102866-910b-0410-8b05-ffd578937521
This commit is contained in:
Richard Frith-Macdonald 2002-10-04 09:32:48 +00:00
parent 5dae3b400b
commit 326694568f
2 changed files with 47 additions and 8 deletions

View file

@ -4,7 +4,8 @@
is released properly on failure, and we don't generate log messages
where a lower level API should be doing it. Added some documentation.
* Source/NSDictionary.m: ditto
* Source/NSString.m: ditto
* Source/NSString.m: ditto. Also add support for understanding the
unicode BOM at the start of UTF8 data and stripping it.
* Source/NSData.m: Tidied read from and write to file,
adding lots of logging information. Also documented quite a bit.
Resolved all conflicts found with Adam's change ... generally in

View file

@ -1135,6 +1135,14 @@ handle_printf_atsign (FILE *stream,
}
#endif
/**
* Initialises the receiver with the supplied data, using the
* specified encoding.<br />
* For NSUnicodeStringEncoding and NSUTF8StringEncoding, a Byte Order
* Marker (if present at the start of the data) is removed automatically.<br />
* If the data can not be interpreted using the encoding, the receiver
* is released and nil is returned.
*/
- (id) initWithData: (NSData*)data
encoding: (NSStringEncoding)encoding
{
@ -1161,8 +1169,18 @@ handle_printf_atsign (FILE *stream,
}
else if (encoding == NSUTF8StringEncoding)
{
const char *bytes = [data bytes];
unsigned i = 0;
const unsigned char *bytes = [data bytes];
unsigned i = 0;
/*
* If the data begins with the UTF8 Byte Order Marker (as a
* signature for UTF8 data) we must remove it.
*/
if (len > 2 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF)
{
len -= 3;
bytes += 3;
}
if (_ByteEncodingOk)
{
@ -1172,7 +1190,7 @@ handle_printf_atsign (FILE *stream,
*/
while (i < len)
{
if (((unsigned char*)bytes)[i] > 127)
if ((bytes)[i] > 127)
{
break;
}
@ -1280,12 +1298,22 @@ handle_printf_atsign (FILE *stream,
}
/**
* Initialises the receiver with the contents of the file at path.<br />
* Invokes [NSData-initWithContentsOfFile:] to read the file, then
* <p>Initialises the receiver with the contents of the file at path.
* </p>
* <p>Invokes [NSData-initWithContentsOfFile:] to read the file, then
* examines the data to infer its encoding type, and converts the
* data to a string using -initWithData:encoding:<br />
* Releases the receiver and returns nil if the file could not be read
* data to a string using -initWithData:encoding:
* </p>
* <p>The encoding to use is determined as follows ... if the data begins
* with the 16-bit unicode Byte Order Marker, then it is assumed to be
* unicode data in the appropriate ordering and converted as such.<br />
* If it begins with a UTF8 representation of the BOM, the UTF8 encoding
* is used.<br />
* Otherwise, the default C String encoding is used.
* </p>
* <p>Releases the receiver and returns nil if the file could not be read
* and converted to a string.
* </p>
*/
- (id) initWithContentsOfFile: (NSString*)path
{
@ -1303,6 +1331,8 @@ handle_printf_atsign (FILE *stream,
len = [d length];
if (len == 0)
{
RELEASE(d);
RELEASE(self);
return @"";
}
test = [d bytes];
@ -1313,6 +1343,10 @@ handle_printf_atsign (FILE *stream,
/* somebody set up us the BOM! */
enc = NSUnicodeStringEncoding;
}
else if (len > 2 && test[0] == 0xEF && test[1] == 0xBB && test[2] == 0xBF)
{
enc = NSUTF8StringEncoding;
}
}
self = [self initWithData: d encoding: enc];
RELEASE(d);
@ -1348,6 +1382,10 @@ handle_printf_atsign (FILE *stream,
{
enc = NSUnicodeStringEncoding;
}
else if (len > 2 && test[0] == 0xEF && test[1] == 0xBB && test[2] == 0xBF)
{
enc = NSUTF8StringEncoding;
}
}
self = [self initWithData: d encoding: enc];
if (self == nil)