Add rudimentary parser implementation for when libxml2 is not available.

git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/base/trunk@24240 72102866-910b-0410-8b05-ffd578937521
This commit is contained in:
Richard Frith-MacDonald 2006-12-26 07:00:41 +00:00
parent 73f83c5c59
commit 06e90f42d7
2 changed files with 552 additions and 1 deletions

View file

@ -1,3 +1,8 @@
2006-12-26 Dr. H. Nikolaus Schaller, Richard Frith-Macdonald
* Source/NSXMLParser.m: Implement reduced functionality parser if
LIBXML2 is not available.
2006-12-26 Richard Frith-Macdonald <rfm@gnu.org>
* Headers/Foundation/NSValueTransformer.h:

View file

@ -23,7 +23,8 @@
*/
#include <Additions/GNUstepBase/GSXML.h>
#include "config.h"
#include <Foundation/NSArray.h>
#include <Foundation/NSError.h>
#include <Foundation/NSException.h>
#include <Foundation/NSXMLParser.h>
@ -32,6 +33,10 @@
NSString* const NSXMLParserErrorDomain = @"NSXMLParserErrorDomain";
#ifdef HAVE_LIBXML
#include <Additions/GNUstepBase/GSXML.h>
@interface NSXMLSAXHandler : GSSAXHandler
{
@public
@ -435,6 +440,547 @@ NSString* const NSXMLParserErrorDomain = @"NSXMLParserErrorDomain";
@end
#else
@implementation NSString (NSXMLParser)

/**
 * Returns a copy of the receiver in which the five characters that are
 * special in XML (&amp; &lt; &gt; &quot; &apos;) are replaced by their
 * predefined entity references.
 */
- (NSString *) _stringByExpandingXMLEntities
{
  /* The ampersand must stay first: every replacement text itself
   * contains an ampersand which must not be escaped a second time.
   */
  NSString		*plain[] = { @"&", @"<", @">", @"\"", @"'" };
  NSString		*escaped[] = { @"&amp;", @"&lt;", @"&gt;", @"&quot;", @"&apos;" };
  NSMutableString	*result = [NSMutableString stringWithString: self];
  unsigned		i;

  for (i = 0; i < sizeof(plain) / sizeof(plain[0]); i++)
    {
      NSRange	everything = NSMakeRange(0, [result length]);

      [result replaceOccurrencesOfString: plain[i]
			      withString: escaped[i]
				 options: 0
				   range: everything];
    }
  return result;
}

@end
/**
 * Builds an autoreleased string from len bytes of UTF-8 text at ptr.
 * Returns nil (after logging a diagnostic) if the bytes cannot be
 * converted to a string.
 */
static NSString *UTF8STR(const void *ptr, int len)
{
  NSString	*s;

  s = [[NSString alloc] initWithBytes: ptr
			       length: len
			     encoding: NSUTF8StringEncoding];
  if (s == nil)
    {
      /* %p is the only portable conversion for a pointer; the former
       * %08x mismatched the argument type and truncated the address
       * on 64-bit targets.
       */
      NSLog(@"could not convert to UTF8 string! bytes=%p len=%d", ptr, len);
    }
  return AUTORELEASE(s);
}
/*
 * Private state of the fallback (non-libxml2) parser.  One instance is
 * allocated per NSXMLParser in -initWithData: and reached through the
 * parser's _parser pointer (see the `this' macro below).
 */
typedef struct NSXMLParserIvarsType
{
NSMutableArray *tagPath; // stack of currently open element names
NSData *data; // copy of the document being parsed (owns the bytes cp/cend point into)
NSError *error; // last error recorded by -_parseError:
const unsigned char *cp; // read position: next byte to be fetched
const unsigned char *cend; // one past the last byte of data
int line; // current line (counts from 0)
int column; // current column (counts from 0)
BOOL abort; // set to make the parse loop stop
BOOL shouldProcessNamespaces; // stored option; not consulted by the parse loop shown here
BOOL shouldReportNamespacePrefixes; // stored option; not consulted by the parse loop shown here
BOOL shouldResolveExternalEntities; // stored option; not consulted by the parse loop shown here
BOOL acceptHTML; // be lenient about bad tag nesting and case
} NSXMLParserIvars;
@implementation NSXMLParser
#define this ((NSXMLParserIvars*)_parser)
#define _del ((id)_handler)
/**
 * Asks the parser to stop: sets the flag which the loop in -parse
 * tests before handling each further character.
 */
- (void) abortParsing
{
  NSXMLParserIvars	*state = this;

  state->abort = YES;
}
/**
 * Returns the parser's current column counter (zero based).
 */
- (int) columnNumber
{
  NSXMLParserIvars	*state = this;

  return state->column;
}
/**
 * Releases the objects held in the private state block and frees the
 * block itself, then lets the superclass finish deallocation.
 */
- (void) dealloc
{
  NSXMLParserIvars	*state = this;

  if (state == 0)
    {
      /* The state block was never allocated. */
      [super dealloc];
      return;
    }
  RELEASE(state->data);
  RELEASE(state->error);
  RELEASE(state->tagPath);
  NSZoneFree([self zone], state);
  [super dealloc];
}
/**
 * Returns the object which receives the parser's callbacks.
 */
- (id) delegate
{
  id	current = _del;

  return current;
}
/**
 * Convenience initialiser: loads the contents of url and hands the
 * resulting data to -initWithData:.
 */
- (id) initWithContentsOfURL: (NSURL *)url
{
  NSData	*contents = [NSData dataWithContentsOfURL: url];

  return [self initWithData: contents];
}
/**
 * Initialises the parser to read the XML text held in data.<br />
 * If data is nil the receiver is destroyed and nil is returned.
 * Otherwise a zeroed private state block is allocated, the data is
 * copied, and the read pointers are set to span the copied bytes.
 */
- (id) initWithData: (NSData *)data
{
  NSXMLParserIvars	*state;

  if (data == nil)
    {
      DESTROY(self);
      return self;
    }

  self = [super init];
  if (!self)
    {
      return self;
    }

  _parser = NSZoneMalloc([self zone], sizeof(NSXMLParserIvars));
  memset(_parser, '\0', sizeof(NSXMLParserIvars));
  state = this;
  state->data = [data copy];
  state->tagPath = [[NSMutableArray alloc] init];
  /* The read pointers refer to the private copy, not to the argument. */
  state->cp = [state->data bytes];
  state->cend = state->cp + [state->data length];
  return self;
}
/**
 * Returns the parser's current line counter (zero based).
 */
- (int) lineNumber
{
  NSXMLParserIvars	*state = this;

  return state->line;
}
/**
 * Sets the object which is to receive the parser's callbacks.
 * The pointer is stored as-is; nothing in this method retains it.
 */
- (void) setDelegate: (id)del
{
_handler = del;
}
/**
 * Returns the error stored by the most recent call to -_parseError:
 * (nil if none has been stored).
 */
- (NSError *) parserError
{
  NSXMLParserIvars	*state = this;

  return state->error;
}
/**
 * Returns the stack of currently open element names (outermost first).
 * This is the parser's own mutable array, not a copy.
 */
- (NSArray *) _tagPath
{
  NSXMLParserIvars	*state = this;

  return state->tagPath;
}
#define cget() ((this->cp < this->cend)?(this->column++, *this->cp++): -1)
/**
 * Records a parse error, stops the parse loop and informs the delegate.
 * <p>message is a human readable description of the problem; it is
 * stored in the error's user info under NSLocalizedDescriptionKey.</p>
 * Always returns NO so that callers can write
 * <code>return [self _parseError: ...];</code>.
 */
- (BOOL) _parseError: (NSString *)message
{
  NSDictionary	*info = nil;
  NSError	*err;

#if 0
  NSLog(@"XML parseError: %@", message);
#endif
  if (message != nil)
    {
      info = [NSDictionary dictionaryWithObject: message
					 forKey: NSLocalizedDescriptionKey];
    }
  /* Previously the message was thrown away and a nil error was stored
   * and reported, so neither -parserError nor the delegate could tell
   * what went wrong.
   * NOTE(review): no error code enumeration is visible in this file;
   * 1 is believed to match Apple's NSXMLParserInternalError - confirm
   * and replace with the symbolic name if the header provides one.
   */
  err = [NSError errorWithDomain: NSXMLParserErrorDomain
			    code: 1
			userInfo: info];
  ASSIGN(this->error, err);
  this->abort = YES;		// break the parse loop
  if ([_del respondsToSelector: @selector(parser:parseErrorOccurred:)])
    {
      [_del parser: self parseErrorOccurred: this->error];
    }
  return NO;
}
/**
 * Handles one complete tag found by -parse.
 * <p>tag is the tag name without the leading '/' of a closing tag,
 * flag is YES for a closing tag, and attributes holds the attribute
 * values of an opening tag.</p>
 * <p>Opening tags: the XML declaration triggers parserDidStartDocument:,
 * other processing instructions and the DOCTYPE / ENTITY / CDATA
 * declarations are only logged, and everything else is pushed on the
 * tag stack and reported through
 * parser:didStartElement:namespaceURI:qualifiedName:attributes:.</p>
 * <p>Closing tags: the tag must match the top of the stack.  In strict
 * mode a mismatch is a parse error; in HTML mode unclosed elements are
 * closed implicitly until a match is found, and a closing tag with no
 * matching open tag is ignored (after the stack has been emptied).</p>
 */
- (void) _processTag: (NSString *)tag
	       isEnd: (BOOL)flag
      withAttributes: (NSDictionary *)attributes
{
  BOOL	html = this->acceptHTML;

#if 0
  NSLog(@"_processTag <%@%@ %@>", flag?@"/": @"", tag, attributes);
#endif
  if (html)
    {
      tag = [tag lowercaseString];	// HTML names are not case sensitive
    }
  if (!flag)
    {
      if ([tag isEqualToString: @"?xml"])
	{
	  // parse, i.e. check for UTF8 encoding and other attributes
#if 0
	  NSLog(@"parserDidStartDocument: ");
#endif
	  if ([_del respondsToSelector: @selector(parserDidStartDocument:)])
	    {
	      [_del parserDidStartDocument: self];
	    }
	  return;
	}
      if ([tag hasPrefix: @"?"])
	{
#if 1
	  NSLog(@"_processTag <%@%@ %@>", flag?@"/": @"", tag, attributes);
#endif
	  // parser: foundProcessingInstructionWithTarget: data:
	  return;
	}
      /* In HTML mode the name has just been lower-cased, so it must be
       * compared with the lower-case spelling of the declaration
       * keywords.  Comparing with the upper-case spelling only (as was
       * done before) never matched, and e.g. <!DOCTYPE html> was pushed
       * on the stack and reported as an ordinary element.
       */
      if ([tag isEqualToString: html ? @"!doctype" : @"!DOCTYPE"])
	{
	  // parse and might load
#if 1
	  NSLog(@"_processTag <%@%@ %@>", flag?@"/": @"", tag, attributes);
#endif
	  return;
	}
      if ([tag isEqualToString: html ? @"!entity" : @"!ENTITY"])
	{
	  // parse
#if 1
	  NSLog(@"_processTag <%@%@ %@>", flag?@"/": @"", tag, attributes);
#endif
	  return;
	}
      if ([tag isEqualToString: html ? @"!cdata" : @"!CDATA"])
	{
	  // pass through as NSData
	  // parser: foundCDATA:
#if 1
	  NSLog(@"_processTag <%@%@ %@>", flag?@"/": @"", tag, attributes);
#endif
	  return;
	}
      [this->tagPath addObject: tag];	// push on stack
      if ([_del respondsToSelector: @selector(parser:didStartElement:namespaceURI:qualifiedName:attributes:)])
	{
	  [_del parser: self
	    didStartElement: tag
	       namespaceURI: nil
	      qualifiedName: nil
		 attributes: attributes];
	}
    }
  else
    {
      // closing tag
      if (html)
	{
	  // lazily close any missing tags on stack
	  while ([this->tagPath count] > 0
	    && ![[this->tagPath lastObject] isEqualToString: tag])
	    {
	      if ([_del respondsToSelector: @selector(parser:didEndElement:namespaceURI:qualifiedName:)])
		{
		  [_del parser: self
		    didEndElement: [this->tagPath lastObject]
		     namespaceURI: nil
		    qualifiedName: nil];
		}
	      [this->tagPath removeLastObject];	// pop from stack
	    }
	  if ([this->tagPath count] == 0)
	    {
	      return;	// ignore closing tag without matching open...
	    }
	}
      else if (![[this->tagPath lastObject] isEqualToString: tag])
	{
	  /* Names must be literally equal.  An empty stack gives a nil
	   * lastObject, which also ends up here.
	   */
	  [self _parseError: [NSString stringWithFormat:
	    @"tag nesting error (</%@> expected, </%@> found)",
	    [this->tagPath lastObject], tag]];
	  return;
	}
      if ([_del respondsToSelector: @selector(parser:didEndElement:namespaceURI:qualifiedName:)])
	{
	  [_del parser: self
	    didEndElement: tag
	     namespaceURI: nil
	    qualifiedName: nil];
	}
      [this->tagPath removeLastObject];	// pop from stack
    }
}
/**
 * Parses an entity or character reference.  On entry the read pointer
 * must be just behind the '&amp;'; on success it is left just behind
 * the terminating ';'.
 * <p>Returns the replacement text for the five predefined entities and
 * for well formed numeric references (&amp;#ddd; and &amp;#xhh;).  Any
 * other reference is logged and its bare name is returned.</p>
 * <p>Returns nil if the end of the data or a '&lt;' is met before the
 * terminating ';'.</p>
 */
- (NSString *) _entity
{
  const unsigned char	*ep = this->cp;	// should be position behind &
  int			c;
  int			len;
  NSString		*entity;

  do
    {
      c = cget();
    }
  while (c != EOF && c != '<' && c != ';');
  if (c != ';')
    {
      return nil;	// end of data or missing ; before next tag
    }
  len = this->cp - ep - 1;	// length of the name, without the ;

  if (len > 0 && *ep == '#')
    {
      /* Numeric character reference.  The digits are decoded by hand
       * within [ep+1, ep+len): the data is not NUL terminated, so it
       * must not be handed to sscanf(), which may read past the end
       * of the buffer.
       */
      const unsigned char	*dp = ep + 1;
      const unsigned char	*de = ep + len;
      unsigned int		base = 10;
      unsigned int		val = 0;
      BOOL			ok;

      if (dp < de && *dp == 'x')
	{
	  base = 16;
	  dp++;
	}
      ok = (dp < de) ? YES : NO;	// at least one digit is required
      while (ok == YES && dp < de)
	{
	  unsigned int	d = *dp++;

	  if (d >= '0' && d <= '9')
	    d -= '0';
	  else if (base == 16 && d >= 'a' && d <= 'f')
	    d -= 'a' - 10;
	  else if (base == 16 && d >= 'A' && d <= 'F')
	    d -= 'A' - 10;
	  else
	    ok = NO;
	  if (ok == YES)
	    {
	      val = val * base + d;
	      if (val > 0x10FFFF)
		ok = NO;	// beyond the Unicode range
	    }
	}
      if (ok == YES)
	{
	  if (val > 0xFFFF)
	    {
	      /* A unichar holds 16 bits only, so code points outside
	       * the basic plane are encoded as a surrogate pair.
	       */
	      unichar	pair[2];

	      val -= 0x10000;
	      pair[0] = (unichar)(0xD800 + (val >> 10));
	      pair[1] = (unichar)(0xDC00 + (val & 0x3FF));
	      return [NSString stringWithCharacters: pair length: 2];
	    }
	  return [NSString stringWithFormat: @"%C", (unichar)val];
	}
      /* Malformed number: fall through and treat it as unrecognized. */
    }
  else
    {
      // the five predefined entities
      if (len == 3 && strncmp((char *)ep, "amp", len) == 0)
	return @"&";
      if (len == 2 && strncmp((char *)ep, "lt", len) == 0)
	return @"<";
      if (len == 2 && strncmp((char *)ep, "gt", len) == 0)
	return @">";
      if (len == 4 && strncmp((char *)ep, "quot", len) == 0)
	return @"\"";
      if (len == 4 && strncmp((char *)ep, "apos", len) == 0)
	return @"'";
    }

  entity = UTF8STR(ep, len);
#if 1
  NSLog(@"NSXMLParser: unrecognized entity: &%@;", entity);
#endif
  // entity=[entitiesTable objectForKey: entity]; // look up string in entity translation table
  if (!entity)
    {
      entity = @"&??;";	// name could not be converted to a string
    }
  return entity;
}
/**
 * Reads one attribute name or attribute value.
 * <p>The caller has already fetched the first character of the
 * argument; this method steps back one character and reads it again.</p>
 * <p>If the argument starts with a double or single quote, everything up
 * to the matching quote is returned (without the quotes) and the read
 * pointer is left behind the closing quote.  nil is returned if the
 * data ends before the closing quote.</p>
 * <p>Otherwise characters are collected up to (not including) the next
 * whitespace, '&gt;', '/', '?', '=' or the end of the data, and the read
 * pointer is left on that terminating character.</p>
 */
- (NSString *) _qarg
{
  const unsigned char	*ap;	// argument start pointer
  int			c;

  /* Un-read the character the caller fetched.  The column counter is
   * stepped back too, since cget() will count the character again.
   */
  this->cp--;
  this->column--;
  ap = this->cp;
  c = cget();			// refetch first character
#if 0
  NSLog(@"_qarg: %02x %c", c, isprint(c)?c: ' ');
#endif
  if (c == '\"' || c == '\'')
    {
      int	quote = c;	// the closing quote must match the opening one

      do
	{
	  c = cget();
	  if (c == EOF)
	    {
	      return nil;	// unterminated!
	    }
	}
      while (c != quote);
      return UTF8STR(ap + 1, this->cp - ap - 2);
    }

  // strict XML requires quoting (?) - unquoted arguments are accepted anyway
  while (c != EOF && !isspace(c)
    && c != '>' && c != '/' && c != '?' && c != '=')
    {
      c = cget();
    }
  if (c != EOF)
    {
      /* Go back to the terminating character.  At the end of the data
       * cget() has not advanced, so stepping back there would cut the
       * last character off the argument (and make it be read twice).
       */
      this->cp--;
      this->column--;
    }
  return UTF8STR(ap, this->cp - ap);
}
/**
 * Parses the data given at initialisation, reporting what is found to
 * the delegate: character data, entity replacements, and (through
 * -_processTag:isEnd:withAttributes:) start and end tags.  Comments
 * are skipped.
 * <p>Unless HTML is accepted the data must begin with the bytes
 * "&lt;?xml " and all elements must be closed when the data ends.</p>
 * <p>Returns YES if the end of the data was reached and
 * parserDidEndDocument: was sent; returns NO if a parse error occurred
 * or the parse was aborted.</p>
 */
- (BOOL) parse
{
  const unsigned char	*vp = this->cp;	// start of pending character data
  int			c;

  if (!this->acceptHTML
    && (this->cend - this->cp < 6
      || strncmp((char *)this->cp, "<?xml ", 6) != 0))
    {
      // not a valid XML document start
      return [self _parseError: @"missing <?xml > preamble"];
    }

  c = cget();	// get first character
  while (!this->abort)
    {
      // parse next element
#if 0
      NSLog(@"_nextelement %02x %c", c, isprint(c)?c: ' ');
#endif
      switch (c)
	{
	  case '\r':
	    this->column = 0;
	    break;

	  case '\n':
	    this->line++;
	    this->column = 0;
	    /* Must not fall through: flushing the pending text at every
	     * newline also dropped the newline from the character data.
	     */
	    break;

	  case EOF:
	  case '<':
	  case '&':
	    {
	      /* Push out any characters that have been collected so far.
	       * The pending text ends before the delimiter just read,
	       * except at the end of the data, where cget() did not
	       * advance and there is no delimiter to leave out.
	       */
	      const unsigned char	*end;

	      end = (c == EOF) ? this->cp : this->cp - 1;
	      if (end > vp)
		{
		  // check for whitespace only - might set/reset a flag to indicate so
		  if ([_del respondsToSelector: @selector(parser:foundCharacters:)])
		    {
		      [_del parser: self
			foundCharacters: UTF8STR(vp, end - vp)];
		    }
		  vp = this->cp;
		}
	    }
	    break;
	}

      switch (c)
	{
	  default:
	    c = cget();	// just collect until we push out (again)
	    continue;

	  case EOF:	// end of data
	    {
	      if ([this->tagPath count] != 0)
		{
		  if (!this->acceptHTML)
		    {
		      // strict XML nesting error
		      return [self _parseError: @"unexpected end of file"];
		    }
		  while ([this->tagPath count] > 0)
		    {
		      // lazily close all open tags
		      if ([_del respondsToSelector: @selector(parser:didEndElement:namespaceURI:qualifiedName:)])
			{
			  [_del parser: self
			    didEndElement: [this->tagPath lastObject]
			     namespaceURI: nil
			    qualifiedName: nil];
			}
		      [this->tagPath removeLastObject];	// pop from stack
		    }
		}
#if 0
	      NSLog(@"parserDidEndDocument: ");
#endif
	      if ([_del respondsToSelector: @selector(parserDidEndDocument:)])
		{
		  [_del parserDidEndDocument: self];
		}
	      return YES;
	    }

	  case '&':
	    {
	      // escape entity begins
	      NSString	*entity = [self _entity];

	      if (!entity)
		{
		  return [self _parseError: @"empty entity name"];
		}
	      if ([_del respondsToSelector: @selector(parser:foundCharacters:)])
		{
		  [_del parser: self foundCharacters: entity];
		}
	      vp = this->cp;	// next value sequence starts here
	      c = cget();	// first character behind ;
	      continue;
	    }

	  case '<':
	    {
	      // tag begins
	      NSString			*tag;
	      NSMutableDictionary	*parameters;
	      NSString			*arg;
	      const unsigned char	*tp = this->cp;	// tag pointer
	      BOOL			isClosing;

	      if (this->cp < this->cend-3
		&& strncmp((char *)this->cp, "!--", 3) == 0)
		{
		  // start of comment: skip all characters until "-->"
		  this->cp += 3;
		  while (this->cp < this->cend-3
		    && strncmp((char *)this->cp, "-->", 3) != 0)
		    {
		      this->cp++;	// search
		    }
		  // if _del responds to parser: foundComment:
		  // convert to string (tp+4 ... cp)
		  this->cp += 3;
		  if (this->cp > this->cend)
		    {
		      /* Unterminated comment at the very end of the data:
		       * never leave the read pointer outside the buffer.
		       */
		      this->cp = this->cend;
		    }
		  vp = this->cp;	// value might continue
		  c = cget();		// get first character behind comment
		  continue;
		}

	      c = cget();	// get first character of tag
	      if (c == '/')
		{
		  c = cget();	// closing tag </tag begins
		}
	      else if (c == '?')
		{
		  // special tag <?tag begins
		  c = cget();	// include in tag string
		  // FIXME: should process this tag in a special way so that e.g. <?php any PHP script ?> is read as a single tag!
		  // to do this properly, we need a notion of comments and quoted string constants...
		}
	      /* Scan the tag name up to a delimiting character.  The end
	       * of the data must end the scan too: cget() keeps returning
	       * EOF, so without that test truncated input looped forever.
	       */
	      while (c != EOF && !isspace(c)
		&& c != '>' && c != '/' && c != '?')
		{
		  c = cget();
		}
	      if (c == EOF)
		{
		  if (!this->acceptHTML)
		    {
		      return [self _parseError:
			@"unexpected end of file inside tag"];
		    }
		  vp = this->cp;	// drop the fragment; EOF is handled next
		  continue;
		}
	      isClosing = (*tp == '/') ? YES : NO;
	      if (isClosing)
		{
		  // don't include / and delimiting character
		  tag = UTF8STR(tp + 1, this->cp - tp - 2);
		}
	      else
		{
		  // don't include delimiting character
		  tag = UTF8STR(tp, this->cp - tp - 1);
		}
#if 0
	      NSLog(@"tag=%@ - %02x %c", tag, c, isprint(c)?c: ' ');
#endif
	      parameters = [NSMutableDictionary dictionaryWithCapacity: 5];
	      while (c != EOF)
		{
		  const unsigned char	*mark;

		  // collect arguments
		  if (c == '/' && !isClosing)
		    {
		      // appears to be a />
		      c = cget();
		      if (c != '>')
			{
			  return [self _parseError:
			    @"<tag/ is missing the >"];
			}
		      [self _processTag: tag
				  isEnd: NO
			 withAttributes: parameters];	// opening tag
		      [self _processTag: tag
				  isEnd: YES
			 withAttributes: nil];		// closing tag
		      break;	// done
		    }
		  if (c == '?' && *tp == '?')
		    {
		      // appears to be a ?>
		      c = cget();
		      if (c != '>')
			{
			  return [self _parseError:
			    @"<?tag ...? is missing the >"];
			}
		      // single <?tag ...?>
		      [self _processTag: tag
				  isEnd: NO
			 withAttributes: parameters];
		      break;	// done
		    }
		  while (isspace(c))	// includes line breaks and tabs
		    {
		      c = cget();
		    }
		  if (c == EOF)
		    {
		      /* Data ended inside the tag.  -_qarg must not be
		       * called here: it would step back onto the last
		       * byte and the loop would never terminate.
		       */
		      break;
		    }
		  if (c == '>')
		    {
		      [self _processTag: tag
				  isEnd: isClosing
			 withAttributes: parameters];	// handle tag
		      break;
		    }

		  mark = this->cp;
		  // get next argument (eats up to /, ?, >, =, space)
		  arg = [self _qarg];
#if 0
		  NSLog(@"arg=%@", arg);
#endif
		  if (!this->acceptHTML && [arg length] == 0)
		    {
		      return [self _parseError: @"empty attribute name"];
		    }
		  if (arg == nil)
		    {
		      /* Unterminated quote: the rest of the data has been
		       * consumed and nil cannot be used as a key.
		       */
		      c = EOF;
		      break;
		    }
		  c = cget();	// get delimiting character
		  if (c == '=')
		    {
		      // explicit assignment
		      NSString	*value = nil;

		      c = cget();	// skip =
		      if (c != EOF)
			{
			  value = [self _qarg];
			}
		      if (value == nil)
			{
			  /* Data ended (possibly inside a quoted value);
			   * nil cannot be stored in the dictionary.
			   */
			  if (!this->acceptHTML)
			    {
			      return [self _parseError:
				@"unterminated attribute value"];
			    }
			  c = EOF;
			  break;
			}
		      [parameters setObject: value forKey: arg];
		      c = cget();	// get character behind qarg value
		    }
		  else
		    {
		      // implicit
		      [parameters setObject: @"" forKey: arg];
		      if (this->cp == mark)
			{
			  /* Nothing was consumed: a stray delimiter such
			   * as the '/' in </a/> or a '?' outside a <?tag.
			   * Skip it, or this loop would spin on the same
			   * character forever.
			   */
			  c = cget();
			}
		    }
		}
	      if (c == EOF && !this->acceptHTML)
		{
		  // the loop is only left with EOF when the tag was cut short
		  return [self _parseError:
		    @"unexpected end of file inside tag"];
		}
	      vp = this->cp;	// prepare for next value
	      c = cget();	// skip > and fetch next character
	    }
	}
    }

  if (this->error != nil)
    {
      /* The loop was stopped by an error which has already been
       * recorded and reported; do not replace it by a generic one.
       */
      return NO;
    }
  return [self _parseError: @"aborted"];
}
/**
 * Returns YES if the parser has been told to be lenient and accept
 * HTML style input (see -_setAcceptHTML:).
 */
- (BOOL) acceptsHTML
{
  NSXMLParserIvars	*state = this;

  return state->acceptHTML;
}
/**
 * Returns the value last stored by -setShouldProcessNamespaces:
 * (NO by default, as the state block starts out zeroed).
 */
- (BOOL) shouldProcessNamespaces
{
  NSXMLParserIvars	*state = this;

  return state->shouldProcessNamespaces;
}
/**
 * Returns the value last stored by -setShouldReportNamespacePrefixes:
 * (NO by default, as the state block starts out zeroed).
 */
- (BOOL) shouldReportNamespacePrefixes
{
  NSXMLParserIvars	*state = this;

  return state->shouldReportNamespacePrefixes;
}
/**
 * Returns the stored "resolve external entities" option
 * (NO by default, as the state block starts out zeroed).
 */
- (BOOL) shouldResolveExternalEntities
{
  NSXMLParserIvars	*state = this;

  return state->shouldResolveExternalEntities;
}
/**
 * Stores the "process namespaces" option.  The value is only recorded;
 * the parse loop in this implementation does not consult it.
 */
- (void) setShouldProcessNamespaces: (BOOL)flag
{
  NSXMLParserIvars	*state = this;

  state->shouldProcessNamespaces = flag;
}
/**
 * Stores the "report namespace prefixes" option.  The value is only
 * recorded; the parse loop in this implementation does not consult it.
 */
- (void) setShouldReportNamespacePrefixes: (BOOL)flag
{
  NSXMLParserIvars	*state = this;

  state->shouldReportNamespacePrefixes = flag;
}
/**
 * Stores the "resolve external entities" option, which is returned by
 * -shouldResolveExternalEntities.  The value is only recorded; the
 * parse loop in this implementation does not consult it.
 */
- (void) setShouldResolveExternalEntities: (BOOL)flag
{
  /* This used to assign to shouldProcessNamespaces (a copy and paste
   * slip), so the option could never be set and calling the method
   * silently changed the namespace option instead.
   */
  this->shouldResolveExternalEntities = flag;
}
/**
 * Switches lenient HTML handling on or off.  When on, -parse does not
 * insist on the XML preamble, tag names are compared case
 * insensitively, and badly nested tags are closed implicitly.
 */
- (void) _setAcceptHTML: (BOOL) flag
{
  NSXMLParserIvars	*state = this;

  state->acceptHTML = flag;
}
/**
 * Not implemented in this fallback parser: the call is forwarded to
 * -notImplemented: and whatever that returns is passed back.
 */
- (NSString *) publicID
{
  id	result = [self notImplemented: _cmd];

  return result;
}
/**
 * Not implemented in this fallback parser: the call is forwarded to
 * -notImplemented: and whatever that returns is passed back.
 */
- (NSString *) systemID
{
  id	result = [self notImplemented: _cmd];

  return result;
}
@end
#endif
@implementation NSObject (NSXMLParserDelegateEventAdditions)
- (NSData*) parser: (NSXMLParser*)aParser
resolveExternalEntityName: (NSString*)aName