diff --git a/ChangeLog b/ChangeLog
index 685b0088b..27f5ffec9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2010-05-21 Richard Frith-Macdonald
+
+	* Headers/Additions/GNUstepBase/GSMime.h: Expose xml charset method.
+	* Source/NSXMLParser.m: Determine correct charset when parsing.
+
 2010-05-21 Riccardo Mottola
 
 	* Source/inet_pton.c
diff --git a/Headers/Additions/GNUstepBase/GSMime.h b/Headers/Additions/GNUstepBase/GSMime.h
index 6212fceb8..2a2842c09 100644
--- a/Headers/Additions/GNUstepBase/GSMime.h
+++ b/Headers/Additions/GNUstepBase/GSMime.h
@@ -119,6 +119,7 @@ extern "C" {
 #endif
 }
 
++ (NSString*) charsetForXml: (NSData*)xml;
 + (NSString*) charsetFromEncoding: (NSStringEncoding)enc;
 
 /**
diff --git a/Source/NSXMLParser.m b/Source/NSXMLParser.m
index 2859b98e2..6ab0f1778 100644
--- a/Source/NSXMLParser.m
+++ b/Source/NSXMLParser.m
@@ -36,6 +36,7 @@
 #import "Foundation/NSDictionary.h"
 #import "Foundation/NSNull.h"
 #import "GNUstepBase/NSObject+GNUstepBase.h"
+#import "GNUstepBase/GSMime.h"
 
 NSString* const NSXMLParserErrorDomain = @"NSXMLParserErrorDomain";
 
@@ -723,13 +724,37 @@ static SEL foundCommentSel;
       self = [super init];
       if (self)
         {
+          NSStringEncoding	enc;
+
           _parser = NSZoneMalloc([self zone], sizeof(NSXMLParserIvars));
           memset(_parser, '\0', sizeof(NSXMLParserIvars));
-          this->data = [data copy];
+          /* Determine character encoding and convert to utf-8 if needed.
+           */
+          enc = [GSMimeDocument encodingFromCharset:
+            [GSMimeDocument charsetForXml: data]];
+          if (enc == NSUTF8StringEncoding || enc == NSASCIIStringEncoding)
+            {
+              this->data = [data copy];
+            }
+          else
+            {
+              NSString	*tmp;
+
+              tmp = [[NSString alloc] initWithData: data encoding: enc];
+              this->data = [[tmp dataUsingEncoding: NSUTF8StringEncoding] retain];
+              [tmp release];
+            }
           this->tagPath = [[NSMutableArray alloc] init];
           this->namespaces = [[NSMutableArray alloc] init];
           this->cp = [this->data bytes];
           this->cend = this->cp + [this->data length];
+          /* If the data contained utf-8 with a BOM, we must skip it.
+           */
+          if ((this->cend - this->cp) > 2 && this->cp[0] == 0xef
+            && this->cp[1] == 0xbb && this->cp[2] == 0xbf)
+            {
+              this->cp += 3;	// Skip BOM
+            }
         }
     }
   return self;