diff --git a/ChangeLog b/ChangeLog
index 6dbb30b70..d2b51dae2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2006-12-26 Dr. H. Nikolaus Schaller, Richard Frith-Macdonald
+
+	* Source/NSXMLParser.m: Implement reduced functionality parser if
+	LIBXML2 is not available.
+
 2006-12-26 Richard Frith-Macdonald
 
 	* Headers/Foundation/NSValueTransformer.h:
diff --git a/Source/NSXMLParser.m b/Source/NSXMLParser.m
index a2d612bd1..b8fcf3235 100644
--- a/Source/NSXMLParser.m
+++ b/Source/NSXMLParser.m
@@ -23,7 +23,8 @@
 
    */
 
-#include
+#include "config.h"
+#include
 #include
 #include
 #include
@@ -32,6 +33,10 @@
 
 NSString* const NSXMLParserErrorDomain = @"NSXMLParserErrorDomain";
 
+#ifdef HAVE_LIBXML
+
+#include
+
 @interface NSXMLSAXHandler : GSSAXHandler
 {
 @public
@@ -435,6 +440,626 @@ NSString* const NSXMLParserErrorDomain = @"NSXMLParserErrorDomain";
 
 @end
 
+#else
+
+#include <Foundation/NSError.h>
+
+@implementation NSString (NSXMLParser)
+
+/**
+ * Returns a copy of the receiver in which the five characters having
+ * predefined XML entities (&, <, >, " and ') are replaced by the
+ * corresponding entity references.
+ */
+- (NSString *) _stringByExpandingXMLEntities
+{
+  NSMutableString *t=[NSMutableString stringWithString: self];
+  [t replaceOccurrencesOfString: @"&" withString: @"&amp;" options: 0 range: NSMakeRange(0, [t length])];	// must be first!
+  [t replaceOccurrencesOfString: @"<" withString: @"&lt;" options: 0 range: NSMakeRange(0, [t length])];
+  [t replaceOccurrencesOfString: @">" withString: @"&gt;" options: 0 range: NSMakeRange(0, [t length])];
+  [t replaceOccurrencesOfString: @"\"" withString: @"&quot;" options: 0 range: NSMakeRange(0, [t length])];
+  [t replaceOccurrencesOfString: @"'" withString: @"&apos;" options: 0 range: NSMakeRange(0, [t length])];
+  return t;
+}
+
+@end
+
+/*
+ * Makes an autoreleased string from len bytes of UTF-8 data at ptr.
+ * Logs the failure and returns nil if the bytes cannot be converted.
+ */
+static NSString *UTF8STR(const void *ptr, int len)
+{
+  NSString *s;
+
+  s = [[NSString alloc] initWithBytes: ptr
+                               length: len
+                             encoding: NSUTF8StringEncoding];
+  if (s == nil)
+    NSLog(@"could not convert to UTF8 string! bytes=%p len=%d", ptr, len);
+  return AUTORELEASE(s);
+}
+
+/*
+ * Parser state of the reduced functionality parser; an instance is
+ * allocated in -initWithData: and reached through the _parser ivar.
+ */
+typedef struct NSXMLParserIvarsType
+{
+  NSMutableArray *tagPath;	// hierarchy of tags
+  NSData *data;
+  NSError *error;
+  const unsigned char *cp;	// character pointer
+  const unsigned char *cend;	// end of data
+  int line;	// current line (counts from 0)
+  int column;	// current column (counts from 0)
+  BOOL abort;	// abort parse loop
+  BOOL shouldProcessNamespaces;
+  BOOL shouldReportNamespacePrefixes;
+  BOOL shouldResolveExternalEntities;
+  BOOL acceptHTML;	// be lazy with bad tag nesting
+} NSXMLParserIvars;
+
+@implementation NSXMLParser
+
+/* Shorthands for the parser state and for the delegate kept in _handler. */
+#define this ((NSXMLParserIvars*)_parser)
+#define _del ((id)_handler)
+
+- (void) abortParsing
+{
+  this->abort = YES;
+}
+
+- (int) columnNumber
+{
+  return this->column;
+}
+
+- (void) dealloc
+{
+  if (this != 0)
+    {
+      RELEASE(this->data);
+      RELEASE(this->error);
+      RELEASE(this->tagPath);
+      NSZoneFree([self zone], this);
+    }
+  [super dealloc];
+}
+
+- (id) delegate
+{
+  return _del;
+}
+
+- (id) initWithContentsOfURL: (NSURL *)url
+{
+  return [self initWithData: [NSData dataWithContentsOfURL: url]];
+}
+
+- (id) initWithData: (NSData *)data
+{
+  if (data == nil)
+    {
+      DESTROY(self);
+    }
+  else
+    {
+      self = [super init];
+      if (self)
+        {
+          _parser = NSZoneMalloc([self zone], sizeof(NSXMLParserIvars));
+          memset(_parser, '\0', sizeof(NSXMLParserIvars));
+          this->data = [data copy];
+          this->tagPath = [[NSMutableArray alloc] init];
+          this->cp = [this->data bytes];
+          this->cend = this->cp + [this->data length];
+        }
+    }
+  return self;
+}
+
+- (int) lineNumber
+{
+  return this->line;
+}
+
+- (void) setDelegate: (id)del
+{
+  _handler = del;
+}
+
+- (NSError *) parserError
+{
+  return this->error;
+}
+
+- (NSArray *) _tagPath
+{
+  return this->tagPath;
+}
+
+/*
+ * Fetches the next byte and advances (also counting the column), or
+ * yields -1 without advancing once the end of the data is reached.
+ */
+#define cget() ((this->cp < this->cend)?(this->column++, *this->cp++): -1)
+
+/*
+ * Records message as the parser error, stops the parse loop and informs
+ * the delegate if it implements parser:parseErrorOccurred:.
+ * Always returns NO so that callers can simply return the result.
+ */
+- (BOOL) _parseError: (NSString *)message
+{
+#if 0
+  NSLog(@"XML parseError: %@", message);
+#endif
+  NSDictionary *info = nil;
+  NSError *err;
+
+  if (message != nil)
+    info = [NSDictionary dictionaryWithObject: message forKey: NSLocalizedDescriptionKey];
+  // failures are not classified here: all use code 1 (NSXMLParserInternalError in Apple's numbering)
+  err = [NSError errorWithDomain: NSXMLParserErrorDomain code: 1 userInfo: info];
+  ASSIGN(this->error, err);
+  this->abort = YES;	// break loop
+  if ([_del respondsToSelector: @selector(parser:parseErrorOccurred:)])
+    [_del parser: self parseErrorOccurred: this->error];	// pass error
+  return NO;
+}
+
+/*
+ * Handles one complete tag.  Opening tags other than the special ones
+ * (?xml, any other ?..., !DOCTYPE, !ENTITY and !CDATA) are pushed on the
+ * tag stack and reported with didStartElement; closing tags are checked
+ * against the stack, popped and reported with didEndElement.
+ */
+- (void) _processTag: (NSString *)tag
+               isEnd: (BOOL)flag
+      withAttributes: (NSDictionary *)attributes
+{
+#if 0
+  NSLog(@"_processTag <%@%@ %@>", flag?@"/": @"", tag, attributes);
+#endif
+  if (this->acceptHTML)
+    tag = [tag lowercaseString];	// not case sensitive
+  if (!flag)
+    {
+      if ([tag isEqualToString: @"?xml"])
+        {
+          // parse, i.e. check for UTF8 encoding and other attributes
+#if 0
+          NSLog(@"parserDidStartDocument: ");
+#endif
+          if ([_del respondsToSelector: @selector(parserDidStartDocument:)])
+            [_del parserDidStartDocument: self];
+          return;
+        }
+      if ([tag hasPrefix: @"?"])
+        {
+#if 1
+          NSLog(@"_processTag <%@%@ %@>", flag?@"/": @"", tag, attributes);
+#endif
+          // parser: foundProcessingInstructionWithTarget: data:
+          return;
+        }
+      if ([tag isEqualToString: @"!DOCTYPE"])
+        {
+          // parse and might load
+#if 1
+          NSLog(@"_processTag <%@%@ %@>", flag?@"/": @"", tag, attributes);
+#endif
+          return;
+        }
+      if ([tag isEqualToString: @"!ENTITY"])
+        {
+          // parse
+#if 1
+          NSLog(@"_processTag <%@%@ %@>", flag?@"/": @"", tag, attributes);
+#endif
+          return;
+        }
+      if ([tag isEqualToString: @"!CDATA"])
+        {
+          // pass through as NSData
+          // parser: foundCDATA:
+#if 1
+          NSLog(@"_processTag <%@%@ %@>", flag?@"/": @"", tag, attributes);
+#endif
+          return;
+        }
+      [this->tagPath addObject: tag];	// push on stack
+      if ([_del respondsToSelector:@selector(parser:didStartElement:namespaceURI:qualifiedName:attributes:)])
+        [_del parser: self didStartElement: tag namespaceURI: nil qualifiedName: nil attributes: attributes];
+    }
+  else
+    {
+// closing tag
+      if (this->acceptHTML)
+        {
+// lazily close any missing tags on stack
+          while([this->tagPath count] > 0 && ![[this->tagPath lastObject] isEqualToString: tag])	// must be literally equal!
+            {
+              if ([_del respondsToSelector: @selector(parser: didEndElement: namespaceURI: qualifiedName: )])
+                [_del parser: self didEndElement: [this->tagPath lastObject] namespaceURI: nil qualifiedName: nil];
+              [this->tagPath removeLastObject];	// pop from stack
+            }
+          if ([this->tagPath count] == 0)
+            return;	// ignore closing tag without matching open...
+        }
+      else if (![[this->tagPath lastObject] isEqualToString: tag])	// must be literally equal!
+        {
+          [self _parseError: [NSString stringWithFormat: @"tag nesting error (</%@> expected, </%@> found)", [this->tagPath lastObject], tag]];
+          return;
+        }
+      if ([_del respondsToSelector: @selector(parser: didEndElement: namespaceURI: qualifiedName: )])
+        [_del parser: self didEndElement: tag namespaceURI: nil qualifiedName: nil];
+      [this->tagPath removeLastObject];	// pop from stack
+    }
+}
+
+/*
+ * Parses the entity reference whose '&' has just been read, up to and
+ * including the terminating ';'.  Returns the replacement text for numeric
+ * references and the five predefined entities, the bare name for any other
+ * entity, or nil if no ';' is found before the next '<' or the end of data.
+ */
+- (NSString *) _entity
+{
+// parse &xxx; sequence
+  int c;
+  const unsigned char *ep = this->cp;	// should be position behind &
+  int len;
+  unsigned int val;
+  NSString *entity;
+
+  do {
+    c = cget();
+  } while(c != EOF && c != '<' && c != ';');
+
+  if (c != ';')
+    return nil;	// invalid sequence - end of file or missing ; before next tag
+  len = this->cp - ep - 1;
+  if (*ep == '#')
+    {
+// &#ddd; or &#xhh;
+      // ep+1 is terminated by ; and not by 0, so scan a terminated copy
+      char num[16];
+
+      if (len - 1 < (int)sizeof(num))
+        {
+          memcpy(num, ep + 1, len - 1);
+          num[len - 1] = '\0';
+          if (sscanf(num, "x%x", &val) == 1)
+            return [NSString stringWithFormat: @"%C", val];	// &#xhh; hex value
+          else if (sscanf(num, "%u", &val) == 1)
+            return [NSString stringWithFormat: @"%C", val];	// &#ddd; decimal value
+        }
+    }
+  else
+    {
+// the five predefined entities
+      if (len == 3 && strncmp((char *)ep, "amp", len) == 0)
+        return @"&";
+      if (len == 2 && strncmp((char *)ep, "lt", len) == 0)
+        return @"<";
+      if (len == 2 && strncmp((char *)ep, "gt", len) == 0)
+        return @">";
+      if (len == 4 && strncmp((char *)ep, "quot", len) == 0)
+        return @"\"";
+      if (len == 4 && strncmp((char *)ep, "apos", len) == 0)
+        return @"'";
+    }
+  entity = UTF8STR(ep, len);
+#if 1
+  NSLog(@"NSXMLParser: unrecognized entity: &%@;", entity);
+#endif
+//  entity=[entitiesTable objectForKey: entity];	// look up string in entity translation table
+  if (!entity)
+    entity=@"&??;";	// unknown entity
+  return entity;
+}
+
+/*
+ * Reads one attribute name or value starting at the character most recently
+ * fetched.  Quoted text ("..." or '...') is returned without the quotes
+ * and nil is returned if the closing quote is missing; unquoted text ends
+ * before white space, '>', '/', '?', '=' or the end of the data.
+ */
+- (NSString *) _qarg
+{
+// get argument (might be quoted)
+  const unsigned char *ap = --this->cp;	// argument start pointer
+  int c = cget();	// refetch first character
+
+#if 0
+  NSLog(@"_qarg: %02x %c", c, isprint(c)?c: ' ');
+#endif
+  if (c == '\"')
+    {
+// quoted argument
+      do {
+        c = cget();
+        if (c == EOF)
+          return nil;	// unterminated!
+      } while(c != '\"');
+      return UTF8STR(ap + 1, this->cp - ap - 2);
+    }
+  if (c == '\'')
+    {
+// apostrophed argument
+      do {
+        c = cget();
+        if (c == EOF)
+          return nil;	// unterminated!
+      } while(c != '\'');
+      return UTF8STR(ap + 1, this->cp - ap - 2);
+    }
+  if (!this->acceptHTML)
+    ;	// strict XML requires quoting (?)
+  while(!isspace(c) && c != '>' && c != '/' && c != '?' && c != '=' &&c != EOF)
+    c = cget();
+  if (c != EOF)
+    this->cp--;	// go back to terminating character (nothing was consumed at the end of data)
+  return UTF8STR(ap, this->cp - ap);
+}
+
+/**
+ * Parses the data and sends the corresponding messages to the delegate.
+ * Returns YES if the end of the data was reached without an error and NO if
+ * a parse error occurred or parsing was aborted.
+ */
+- (BOOL) parse
+{
+// read XML (or HTML) file
+  const unsigned char *vp = this->cp;	// value pointer
+  int c;
+
+  if (!this->acceptHTML
+    && (this->cend - this->cp < 6
+      || strncmp((char *)this->cp, "<?xml ", 6) != 0))
+    {
+      // not a valid XML document start
+      return [self _parseError: @"missing <?xml > preamble"];
+    }
+  c = cget();	// get first character
+  while(!this->abort)
+    {
+// parse next element
+#if 0
+      NSLog(@"_nextelement %02x %c", c, isprint(c)?c: ' ');
+#endif
+      switch(c)
+        {
+          case '\r':
+            this->column = 0;
+            break;
+          case '\n':
+            this->line++;
+            this->column = 0;
+          case EOF:
+          case '<':
+          case '&':
+            {
+// push out any characters that have been collected so far
+              if (this->cp - vp > 1)
+                {
+                  // check for whitespace only - might set/reset a flag to indicate so
+                  if ([_del respondsToSelector: @selector(parser: foundCharacters: )])
+                    [_del parser: self foundCharacters: UTF8STR(vp, this->cp - vp - 1)];
+                  vp = this->cp;
+                }
+            }
+        }
+      switch(c)
+        {
+          default:
+            c = cget();	// just collect until we push out (again)
+            continue;
+          case EOF:	// end of file
+            {
+              if ([this->tagPath count] != 0)
+                {
+                  if (!this->acceptHTML)
+                    return [self _parseError: @"unexpected end of file"];	// strict XML nesting error
+                  while([this->tagPath count] > 0)
+                    {
+// lazily close all open tags
+                      if ([_del respondsToSelector: @selector(parser: didEndElement: namespaceURI: qualifiedName: )])
+                        [_del parser: self didEndElement: [this->tagPath lastObject] namespaceURI: nil qualifiedName: nil];
+                      [this->tagPath removeLastObject];	// pop from stack
+                    }
+                }
+#if 0
+              NSLog(@"parserDidEndDocument: ");
+#endif
+
+              if ([_del respondsToSelector: @selector(parserDidEndDocument: )])
+                [_del parserDidEndDocument: self];
+              return YES;
+            }
+          case '&':
+            {
+// escape entity begins
+              NSString *entity=[self _entity];
+              if (!entity)
+                return [self _parseError: @"empty entity name"];
+              if ([_del respondsToSelector: @selector(parser: foundCharacters: )])
+                [_del parser: self foundCharacters: entity];
+              vp = this->cp;	// next value sequence starts here
+              c = cget();	// first character behind ;
+              continue;
+            }
+          case '<':
+            {
+// tag begins
+              NSString *tag;
+              NSMutableDictionary *parameters;
+              NSString *arg;
+              const unsigned char *tp = this->cp;	// tag pointer
+              if (this->cp < this->cend-3 && strncmp((char *)this->cp, "!--", 3) == 0)
+                {
+// start of comment skip all characters until "-->"
+                  this->cp+=3;
+                  while(this->cp < this->cend-3 && strncmp((char *)this->cp, "-->", 3) != 0)
+                    this->cp++;	// search
+                  // if _del responds to parser: foundComment:
+                  // convert to string (tp+4 ... cp)
+                  this->cp+=3;	// might go beyond cend but does not care
+                  vp = this->cp;	// value might continue
+                  c = cget();	// get first character behind comment
+                  continue;
+                }
+              c = cget();	// get first character of tag
+              if (c == '/')
+                c = cget();	// closing tag </tag>
+              else if (c == '?')
+                {
+                  // special tag <?tag begin
+                  c = cget();	// include in tag string
+                  // FIXME: should process this tag in a special way so that e.g. <?php any PHP script ?> is read as a single tag!
+                  // to do this properly, we need a notion of comments and quoted string constants...
+                }
+              while(!isspace(c) && c != '>' && (c != '/') && (c != '?') && c != EOF)
+                c = cget();	// scan tag until we find a delimiting character (or run out of data)
+              if (*tp == '/')
+                tag = UTF8STR(tp + 1, this->cp - tp - 2);	// don't include / and delimiting character
+              else
+                tag = UTF8STR(tp, this->cp - tp - 1);	// don't include delimiting character
+#if 0
+              NSLog(@"tag=%@ - %02x %c", tag, c, isprint(c)?c: ' ');
+#endif
+              parameters=[NSMutableDictionary dictionaryWithCapacity: 5];
+              while(c != EOF)
+                {
+// collect arguments
+                  while(isspace(c))	// first skip blanks, so that " />" and " ?>" are recognized below
+                    c = cget();
+                  if (c == EOF)
+                    break;	// data ends inside the tag; _qarg could not refetch a character
+                  if (c == '/' && *tp != '/')
+                    {
+// appears to be a />
+                      c = cget();
+                      if (c != '>')
+                        return [self _parseError: @"<tag/ is missing the >"];
+                      [self _processTag: tag isEnd: NO withAttributes: parameters];	// opening tag
+                      [self _processTag: tag isEnd: YES withAttributes: nil];	// closing tag
+                      break;	// done
+                    }
+                  if (c == '?' && *tp == '?')
+                    {
+// appears to be a ?>
+                      c = cget();
+                      if (c != '>')
+                        return [self _parseError: @"<?tag ...? is missing the >"];
+                      // process
+                      [self _processTag: tag isEnd: NO withAttributes: parameters];	// single <?tag ...?>
+                      break;	// done
+                    }
+                  if (c == '>')
+                    {
+                      [self _processTag: tag isEnd: (*tp=='/') withAttributes: parameters];	// handle tag
+                      break;
+                    }
+                  arg=[self _qarg];	// get next argument (eats up to /, ?, >, =, space)
+#if 0
+                  NSLog(@"arg=%@", arg);
+#endif
+                  if (arg == nil || (!this->acceptHTML && [arg length] == 0))
+                    return [self _parseError: @"empty attribute name"];	// nil can not be used as a key either
+                  c = cget();	// get delimiting character
+                  if (c == '=')
+                    {
+// explicit assignment
+                      NSString *val;
+
+                      c = cget();	// skip =
+                      if (c == EOF)
+                        break;	// nothing behind the = that _qarg could read
+                      val = [self _qarg];
+                      if (val == nil)
+                        return [self _parseError: @"unterminated attribute value"];
+                      [parameters setObject: val forKey: arg];
+                      c = cget();	// get character behind qarg value
+                    }
+                  else	// implicit
+                    {
+                      [parameters setObject: @"" forKey: arg];
+                      if ([arg length] == 0 && c != '>' && c != EOF && !isspace(c))
+                        c = cget();	// lazy mode: step over a stray delimiter instead of rescanning it forever
+                    }
+                }
+              if (c == EOF && !this->acceptHTML)
+                return [self _parseError: @"unexpected end of file"];	// strict XML: tag is not terminated
+              vp = this->cp;	// prepare for next value
+              c = cget();	// skip > and fetch next character
+            }
+        }
+    }
+  if (this->error != nil)
+    return NO;	// the failure has already been recorded and reported
+  return [self _parseError: @"parsing aborted"];	// -abortParsing was called
+}
+
+- (BOOL) acceptsHTML
+{
+  return this->acceptHTML;
+}
+
+- (BOOL) shouldProcessNamespaces
+{
+  return this->shouldProcessNamespaces;
+}
+
+- (BOOL) shouldReportNamespacePrefixes
+{
+  return this->shouldReportNamespacePrefixes;
+}
+
+- (BOOL) shouldResolveExternalEntities
+{
+  return this->shouldResolveExternalEntities;
+}
+
+- (void) setShouldProcessNamespaces: (BOOL)flag
+{
+  this->shouldProcessNamespaces = flag;
+}
+
+- (void) setShouldReportNamespacePrefixes: (BOOL)flag
+{
+  this->shouldReportNamespacePrefixes = flag;
+}
+
+- (void) setShouldResolveExternalEntities: (BOOL)flag
+{
+  this->shouldResolveExternalEntities = flag;
+}
+
+- (void) _setAcceptHTML: (BOOL) flag
+{
+  this->acceptHTML = flag;
+}
+
+- (NSString *) publicID
+{
+  return [self notImplemented: _cmd];
+}
+
+- (NSString *) systemID
+{
+  return [self notImplemented: _cmd];
+}
+
+@end
+
+#endif
+
 @implementation NSObject (NSXMLParserDelegateEventAdditions)
 - (NSData*) parser: (NSXMLParser*)aParser
   resolveExternalEntityName: (NSString*)aName