mirror of
https://github.com/gnustep/libs-base.git
synced 2025-04-22 16:33:29 +00:00
Improve determination of charset for xml data.
git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/base/trunk@21191 72102866-910b-0410-8b05-ffd578937521
This commit is contained in:
parent
1fa4d4de0a
commit
121a44b86a
2 changed files with 212 additions and 74 deletions
|
@ -1,3 +1,11 @@
|
|||
2005-05-07 Richard Frith-Macdonald <rfm@gnu.org>
|
||||
|
||||
* Source/Additions/GSMime.m: Improve handling of text/xml without
|
||||
a charset. Add new method to determine the correct charset in the
|
||||
case where it has to be inferred from the actual data. Follow the
|
||||
instructions in http://www.w3.org/TR/REC-xml/ but be a bit more
|
||||
tolerant. (ebcdic not supported).
|
||||
|
||||
2005-05-04 Richard Frith-Macdonald <rfm@gnu.org>
|
||||
|
||||
* Source/Additions/GSMime.m: If given content of type text/xml without
|
||||
|
|
|
@ -53,6 +53,7 @@
|
|||
#include "config.h"
|
||||
#include <Foundation/Foundation.h>
|
||||
#include "GNUstepBase/GSMime.h"
|
||||
#include "GNUstepBase/GSXML.h"
|
||||
#include "GNUstepBase/GSCategories.h"
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
|
@ -688,6 +689,164 @@ wordData(NSString *word)
|
|||
return AUTORELEASE([[self alloc] init]);
|
||||
}
|
||||
|
||||
/*
|
||||
* Examine xml data to find out the characterset needed to convert from
|
||||
* binary data to an NSString object.
|
||||
*/
|
||||
+ (NSString*) charsetForXml: (NSData*)xml
|
||||
{
|
||||
unsigned int length = [xml length];
|
||||
const unsigned char *ptr = (const unsigned char*)[xml bytes];
|
||||
const unsigned char *end = ptr + length;
|
||||
unsigned int offset = 0;
|
||||
unsigned int size = 1;
|
||||
unsigned char quote = 0;
|
||||
unsigned char buffer[30];
|
||||
unsigned int buflen = 0;
|
||||
BOOL found = NO;
|
||||
|
||||
if (length < 4)
|
||||
{
|
||||
// Not long enough to determine an encoding
|
||||
return nil;
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine encoding using byte-order-mark if present
|
||||
*/
|
||||
if ((ptr[0] == 0xFE && ptr[1] == 0xFF)
|
||||
|| (ptr[0] == 0xFF && ptr[1] == 0xFE))
|
||||
{
|
||||
return @"utf-16";
|
||||
}
|
||||
if (ptr[0] == 0xEF && ptr[1] == 0xBB && ptr[2] == 0xBF)
|
||||
{
|
||||
return @"utf-8";
|
||||
}
|
||||
if ((ptr[0] == 0x00 && ptr[1] == 0x00)
|
||||
&& ((ptr[2] == 0xFE && ptr[3] == 0xFF)
|
||||
|| (ptr[2] == 0xFF && ptr[3] == 0xFE)))
|
||||
{
|
||||
return @"ucs-4";
|
||||
}
|
||||
|
||||
/*
|
||||
* Look for nul bytes to determine whether this is a four byte
|
||||
* encoding or a two byte encoding (or the default).
|
||||
*/
|
||||
if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0)
|
||||
{
|
||||
offset = 3;
|
||||
size = 4;
|
||||
}
|
||||
else if (ptr[0] == 0 && ptr[1] == 0 && ptr[3] == 0)
|
||||
{
|
||||
offset = 2;
|
||||
size = 4;
|
||||
}
|
||||
else if (ptr[0] == 0 && ptr[2] == 0 && ptr[3] == 0)
|
||||
{
|
||||
offset = 1;
|
||||
size = 4;
|
||||
}
|
||||
else if (ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0)
|
||||
{
|
||||
offset = 0;
|
||||
size = 4;
|
||||
}
|
||||
else if (ptr[0] == 0)
|
||||
{
|
||||
offset = 1;
|
||||
size = 2;
|
||||
}
|
||||
else if (ptr[1] == 0)
|
||||
{
|
||||
offset = 0;
|
||||
size = 2;
|
||||
}
|
||||
|
||||
/*
|
||||
* Now look for the xml encoding declaration ...
|
||||
*/
|
||||
|
||||
// Tolerate leading whitespace
|
||||
while (ptr + size <= end && isspace(ptr[offset])) ptr += size;
|
||||
|
||||
if (ptr + (size * 20) >= end || ptr[offset] != '<' || ptr[offset+size] != '?')
|
||||
{
|
||||
if (size == 1)
|
||||
{
|
||||
return @"utf-8";
|
||||
}
|
||||
else if (size == 2)
|
||||
{
|
||||
return @"utf-16";
|
||||
}
|
||||
else
|
||||
{
|
||||
return @"ucs-4";
|
||||
}
|
||||
}
|
||||
ptr += size * 5; // Step past '<?xml' prefix
|
||||
|
||||
while (ptr + size <= end)
|
||||
{
|
||||
unsigned char c = ptr[offset];
|
||||
|
||||
ptr += size;
|
||||
if (quote == 0)
|
||||
{
|
||||
if (c == '\'' || c == '"')
|
||||
{
|
||||
buflen = 0;
|
||||
quote = c;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (isspace(c) || c == '=')
|
||||
{
|
||||
if (buflen == 8)
|
||||
{
|
||||
buffer[8] = '\0';
|
||||
if (strcasecmp(buffer, "encoding") == 0)
|
||||
{
|
||||
found = YES;
|
||||
}
|
||||
}
|
||||
buflen = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (buflen == sizeof(buffer)) buflen = 0;
|
||||
buffer[buflen++] = c;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (c == quote)
|
||||
{
|
||||
if (found == YES)
|
||||
{
|
||||
NSString *tmp;
|
||||
|
||||
tmp = [[NSString alloc] initWithBytes: buffer
|
||||
length: buflen
|
||||
encoding: NSASCIIStringEncoding];
|
||||
AUTORELEASE(tmp);
|
||||
return [tmp lowercaseString];
|
||||
}
|
||||
buflen = 0;
|
||||
quote = 0; // End of quoted section
|
||||
}
|
||||
else
|
||||
{
|
||||
if (buflen == sizeof(buffer)) buflen = 0;
|
||||
buffer[buflen++] = c;
|
||||
}
|
||||
}
|
||||
|
||||
return @"utf-8";
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a coding context object to be used for decoding data
|
||||
* according to the scheme specified in the header.
|
||||
|
@ -2247,6 +2406,8 @@ NSDebugMLLog(@"GSMime", @"Header parsed - %@", info);
|
|||
if ([context atEnd] == YES
|
||||
|| (expect > 0 && rawBodyLength >= expect))
|
||||
{
|
||||
NSString *subtype = [typeInfo objectForKey: @"Subtype"];
|
||||
|
||||
flags.inBody = 0;
|
||||
flags.complete = 1;
|
||||
|
||||
|
@ -2260,101 +2421,56 @@ NSDebugMLLog(@"GSMime", @"Header parsed - %@", info);
|
|||
if ([document contentFile] != nil)
|
||||
{
|
||||
type = @"application";
|
||||
subtype= @"octet-stream";
|
||||
}
|
||||
else
|
||||
{
|
||||
type = @"text";
|
||||
subtype= @"plain";
|
||||
}
|
||||
}
|
||||
|
||||
if ([type isEqualToString: @"text"] == YES)
|
||||
if ([type isEqualToString: @"text"] == YES
|
||||
&& [subtype isEqualToString: @"xml"] == NO)
|
||||
{
|
||||
NSStringEncoding stringEncoding;
|
||||
NSStringEncoding stringEncoding = _defaultEncoding;
|
||||
NSString *string;
|
||||
|
||||
if (typeInfo == nil)
|
||||
{
|
||||
stringEncoding = _defaultEncoding;
|
||||
if (stringEncoding != NSASCIIStringEncoding)
|
||||
{
|
||||
NSString *charset;
|
||||
|
||||
if (typeInfo == nil)
|
||||
{
|
||||
typeInfo = [GSMimeHeader new];
|
||||
[typeInfo setName: @"content-type"];
|
||||
[typeInfo setValue: @"text/plain"];
|
||||
[typeInfo setObject: @"text"
|
||||
forKey: @"Type"];
|
||||
[typeInfo setObject: @"plain"
|
||||
forKey: @"Subtype"];
|
||||
[document setHeader: typeInfo];
|
||||
RELEASE(typeInfo);
|
||||
}
|
||||
charset = [documentClass charsetFromEncoding:
|
||||
stringEncoding];
|
||||
[typeInfo setParameter: charset
|
||||
forKey: @"charset"];
|
||||
}
|
||||
typeInfo = [GSMimeHeader new];
|
||||
[typeInfo setName: @"content-type"];
|
||||
[typeInfo setValue: @"text/plain"];
|
||||
[typeInfo setObject: type forKey: @"Type"];
|
||||
[typeInfo setObject: subtype forKey: @"Subtype"];
|
||||
[document setHeader: typeInfo];
|
||||
RELEASE(typeInfo);
|
||||
}
|
||||
else
|
||||
{
|
||||
NSString *charset;
|
||||
NSString *sub;
|
||||
|
||||
charset = [typeInfo parameterForKey: @"charset"];
|
||||
|
||||
/*
|
||||
* For 'text/xml' content where charset is not supplied,
|
||||
* we may be able to get the information from the
|
||||
* header encoding="..." section.
|
||||
*/
|
||||
if (charset == nil
|
||||
&& (sub = [typeInfo objectForKey: @"Subtype"]) != nil
|
||||
&& [sub isEqualToString: @"xml"] == YES)
|
||||
if (charset != nil)
|
||||
{
|
||||
const unsigned char *ptr = [data bytes];
|
||||
unsigned int length = [data length];
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < length - 10 && i < 500; i++)
|
||||
{
|
||||
if (memcmp(ptr + i, "encoding=\"", 10) == 0)
|
||||
{
|
||||
unsigned start = i + 10;
|
||||
unsigned end = start;
|
||||
|
||||
for (i = start; i < length; i++)
|
||||
{
|
||||
if (ptr[i] == '"')
|
||||
{
|
||||
end = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (end > start)
|
||||
{
|
||||
NSData *d;
|
||||
NSString *c;
|
||||
|
||||
d = [NSData dataWithBytes: ptr + start
|
||||
length: end - start];
|
||||
c = [NSString alloc];
|
||||
c = [c initWithData: d
|
||||
encoding: NSASCIIStringEncoding];
|
||||
if (c != nil)
|
||||
{
|
||||
charset = [c lowercaseString];
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
stringEncoding
|
||||
= [documentClass encodingFromCharset: charset];
|
||||
}
|
||||
|
||||
stringEncoding
|
||||
= [documentClass encodingFromCharset: charset];
|
||||
}
|
||||
|
||||
/*
|
||||
* Ensure that the charset reflects the encoding used.
|
||||
*/
|
||||
if (stringEncoding != NSASCIIStringEncoding)
|
||||
{
|
||||
NSString *charset;
|
||||
|
||||
charset = [documentClass charsetFromEncoding:
|
||||
stringEncoding];
|
||||
[typeInfo setParameter: charset
|
||||
forKey: @"charset"];
|
||||
}
|
||||
|
||||
/*
|
||||
* Assume that content type is best represented as NSString.
|
||||
*/
|
||||
|
@ -4069,6 +4185,20 @@ static NSCharacterSet *tokenSet = nil;
|
|||
NSString *charset = [hdr parameterForKey: @"charset"];
|
||||
NSStringEncoding enc;
|
||||
|
||||
/*
|
||||
* Treat text/xml as a special case ... if we have no charset
|
||||
* specified then we can get the charset from the xml header
|
||||
* or, if that is not present, xml is utf-8
|
||||
*/
|
||||
if (charset == nil
|
||||
&& [[hdr objectForKey: @"Subtype"] isEqualToString: @"xml"] == YES)
|
||||
{
|
||||
charset = [documentClass charsetForXml: content];
|
||||
if (charset == nil)
|
||||
{
|
||||
charset = @"utf-8";
|
||||
}
|
||||
}
|
||||
enc = [documentClass encodingFromCharset: charset];
|
||||
s = [NSStringClass allocWithZone: NSDefaultMallocZone()];
|
||||
s = [s initWithData: content encoding: enc];
|
||||
|
|
Loading…
Reference in a new issue