Improve determination of charset for xml data.

git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/base/trunk@21191 72102866-910b-0410-8b05-ffd578937521
This commit is contained in:
Richard Frith-Macdonald 2005-05-07 08:54:57 +00:00
parent 1fa4d4de0a
commit 121a44b86a
2 changed files with 212 additions and 74 deletions

View file

@ -1,3 +1,11 @@
2005-05-07 Richard Frith-Macdonald <rfm@gnu.org>
* Source/Additions/GSMime.m: Improve handling of text/xml without
a charset. Add new method to determine the correct charset in the
case where it has to be inferred from the actual data. Follow the
instructions in http://www.w3.org/TR/REC-xml/ but be a bit more
tolerant. (ebcdic not supported).
2005-05-04 Richard Frith-Macdonald <rfm@gnu.org>
* Source/Additions/GSMime.m: If given content of type text/xml without

View file

@ -53,6 +53,7 @@
#include "config.h"
#include <Foundation/Foundation.h>
#include "GNUstepBase/GSMime.h"
#include "GNUstepBase/GSXML.h"
#include "GNUstepBase/GSCategories.h"
#include <string.h>
#include <ctype.h>
@ -688,6 +689,164 @@ wordData(NSString *word)
return AUTORELEASE([[self alloc] init]);
}
/*
* Examine xml data to find out the characterset needed to convert from
* binary data to an NSString object.
*/
+ (NSString*) charsetForXml: (NSData*)xml
{
unsigned int length = [xml length];
const unsigned char *ptr = (const unsigned char*)[xml bytes];
const unsigned char *end = ptr + length;
unsigned int offset = 0;
unsigned int size = 1;
unsigned char quote = 0;
unsigned char buffer[30];
unsigned int buflen = 0;
BOOL found = NO;
if (length < 4)
{
// Not long enough to determine an encoding
return nil;
}
/*
* Determine encoding using byte-order-mark if present
*/
if ((ptr[0] == 0xFE && ptr[1] == 0xFF)
|| (ptr[0] == 0xFF && ptr[1] == 0xFE))
{
return @"utf-16";
}
if (ptr[0] == 0xEF && ptr[1] == 0xBB && ptr[2] == 0xBF)
{
return @"utf-8";
}
if ((ptr[0] == 0x00 && ptr[1] == 0x00)
&& ((ptr[2] == 0xFE && ptr[3] == 0xFF)
|| (ptr[2] == 0xFF && ptr[3] == 0xFE)))
{
return @"ucs-4";
}
/*
* Look for nul bytes to determine whether this is a four byte
* encoding or a two byte encoding (or the default).
*/
if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0)
{
offset = 3;
size = 4;
}
else if (ptr[0] == 0 && ptr[1] == 0 && ptr[3] == 0)
{
offset = 2;
size = 4;
}
else if (ptr[0] == 0 && ptr[2] == 0 && ptr[3] == 0)
{
offset = 1;
size = 4;
}
else if (ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0)
{
offset = 0;
size = 4;
}
else if (ptr[0] == 0)
{
offset = 1;
size = 2;
}
else if (ptr[1] == 0)
{
offset = 0;
size = 2;
}
/*
* Now look for the xml encoding declaration ...
*/
// Tolerate leading whitespace
while (ptr + size <= end && isspace(ptr[offset])) ptr += size;
if (ptr + (size * 20) >= end || ptr[offset] != '<' || ptr[offset+size] != '?')
{
if (size == 1)
{
return @"utf-8";
}
else if (size == 2)
{
return @"utf-16";
}
else
{
return @"ucs-4";
}
}
ptr += size * 5; // Step past '<?xml' prefix
while (ptr + size <= end)
{
unsigned char c = ptr[offset];
ptr += size;
if (quote == 0)
{
if (c == '\'' || c == '"')
{
buflen = 0;
quote = c;
}
else
{
if (isspace(c) || c == '=')
{
if (buflen == 8)
{
buffer[8] = '\0';
if (strcasecmp(buffer, "encoding") == 0)
{
found = YES;
}
}
buflen = 0;
}
else
{
if (buflen == sizeof(buffer)) buflen = 0;
buffer[buflen++] = c;
}
}
}
else if (c == quote)
{
if (found == YES)
{
NSString *tmp;
tmp = [[NSString alloc] initWithBytes: buffer
length: buflen
encoding: NSASCIIStringEncoding];
AUTORELEASE(tmp);
return [tmp lowercaseString];
}
buflen = 0;
quote = 0; // End of quoted section
}
else
{
if (buflen == sizeof(buffer)) buflen = 0;
buffer[buflen++] = c;
}
}
return @"utf-8";
}
/**
* Return a coding context object to be used for decoding data
* according to the scheme specified in the header.
@ -2247,6 +2406,8 @@ NSDebugMLLog(@"GSMime", @"Header parsed - %@", info);
if ([context atEnd] == YES
|| (expect > 0 && rawBodyLength >= expect))
{
NSString *subtype = [typeInfo objectForKey: @"Subtype"];
flags.inBody = 0;
flags.complete = 1;
@ -2260,101 +2421,56 @@ NSDebugMLLog(@"GSMime", @"Header parsed - %@", info);
if ([document contentFile] != nil)
{
type = @"application";
subtype= @"octet-stream";
}
else
{
type = @"text";
subtype= @"plain";
}
}
if ([type isEqualToString: @"text"] == YES)
if ([type isEqualToString: @"text"] == YES
&& [subtype isEqualToString: @"xml"] == NO)
{
NSStringEncoding stringEncoding;
NSStringEncoding stringEncoding = _defaultEncoding;
NSString *string;
if (typeInfo == nil)
{
stringEncoding = _defaultEncoding;
if (stringEncoding != NSASCIIStringEncoding)
{
NSString *charset;
if (typeInfo == nil)
{
typeInfo = [GSMimeHeader new];
[typeInfo setName: @"content-type"];
[typeInfo setValue: @"text/plain"];
[typeInfo setObject: @"text"
forKey: @"Type"];
[typeInfo setObject: @"plain"
forKey: @"Subtype"];
[document setHeader: typeInfo];
RELEASE(typeInfo);
}
charset = [documentClass charsetFromEncoding:
stringEncoding];
[typeInfo setParameter: charset
forKey: @"charset"];
}
typeInfo = [GSMimeHeader new];
[typeInfo setName: @"content-type"];
[typeInfo setValue: @"text/plain"];
[typeInfo setObject: type forKey: @"Type"];
[typeInfo setObject: subtype forKey: @"Subtype"];
[document setHeader: typeInfo];
RELEASE(typeInfo);
}
else
{
NSString *charset;
NSString *sub;
charset = [typeInfo parameterForKey: @"charset"];
/*
* For 'text/xml' content where charset is not supplied,
* we may be able to get the information from the
* header encoding="..." section.
*/
if (charset == nil
&& (sub = [typeInfo objectForKey: @"Subtype"]) != nil
&& [sub isEqualToString: @"xml"] == YES)
if (charset != nil)
{
const unsigned char *ptr = [data bytes];
unsigned int length = [data length];
unsigned int i;
for (i = 0; i < length - 10 && i < 500; i++)
{
if (memcmp(ptr + i, "encoding=\"", 10) == 0)
{
unsigned start = i + 10;
unsigned end = start;
for (i = start; i < length; i++)
{
if (ptr[i] == '"')
{
end = i;
break;
}
}
if (end > start)
{
NSData *d;
NSString *c;
d = [NSData dataWithBytes: ptr + start
length: end - start];
c = [NSString alloc];
c = [c initWithData: d
encoding: NSASCIIStringEncoding];
if (c != nil)
{
charset = [c lowercaseString];
}
}
break;
}
}
stringEncoding
= [documentClass encodingFromCharset: charset];
}
stringEncoding
= [documentClass encodingFromCharset: charset];
}
/*
* Ensure that the charset reflects the encoding used.
*/
if (stringEncoding != NSASCIIStringEncoding)
{
NSString *charset;
charset = [documentClass charsetFromEncoding:
stringEncoding];
[typeInfo setParameter: charset
forKey: @"charset"];
}
/*
* Assume that content type is best represented as NSString.
*/
@ -4069,6 +4185,20 @@ static NSCharacterSet *tokenSet = nil;
NSString *charset = [hdr parameterForKey: @"charset"];
NSStringEncoding enc;
/*
* Treat text/xml as a special case ... if we have no charset
* specified then we can get the charset from the xml header
* or, if that is not present, xml is utf-8
*/
if (charset == nil
&& [[hdr objectForKey: @"Subtype"] isEqualToString: @"xml"] == YES)
{
charset = [documentClass charsetForXml: content];
if (charset == nil)
{
charset = @"utf-8";
}
}
enc = [documentClass encodingFromCharset: charset];
s = [NSStringClass allocWithZone: NSDefaultMallocZone()];
s = [s initWithData: content encoding: enc];