mirror of
https://github.com/gnustep/libs-gsweb.git
synced 2025-02-22 19:21:23 +00:00
ADDITIONAL_INCLUDE_DIRS and ADDITIONAL_LIB_DIRS. * Testing/DynamicElements/Makefile.preamble: Ditto. * GSWExtensionsGSW.framework/Makefile.preamble: Ditto. Also remove comments. * GSWeb.framework/Makefile.preamble.in: Ditto. Also correct ADDITIONAL_CPPFLAGS * Testing/DynamicElements/GNUmakefile: Remove unused variables. * GSWeb.framework/GNUmakefile: Ditto. * GSWExtensions.framework/Makefile.postamble: Removed unused targets 'regenerate' and 'copy-dist'. * GSWExtensionsGSW.framework/Makefile.postamble: Ditto. * Testing/DynamicElements/Makefile.postamble: Ditto. * GSWeb.framework/Makefile.postamble: Ditto. Add rule for GNUSTEP_TARGET_DIR dependent 'config.h'. * Testing/DynamicElements/Resources/Info-DynamicElements.plist: Correct spelling. * Testing/DynamicElements/DynamicElements.m: Replace #import with #include. * Testing/DynamicElements/DynamicElements_main.m: Ditto. * Testing/DynamicElements/Main.h: Ditto. * Testing/DynamicElements/Main.m: Ditto. * Examples/hello/Hello.m: Ditto. * Examples/hello/HelloPage.h: Ditto. * Examples/hello/HelloPage.m: Ditto. * Examples/hello/Hello_main.m: Ditto. * Examples/hello/Main.h: Ditto. * Examples/hello/Main.m: Ditto. * Examples/hello/Resources/Info-Hello.plist: Reformatted. * GSWeb.framework/GSWDebug.h/m (GSWLogAssertGoodFn): Adjust prototype to use id instead of NSObject * and dereference via class_pointer instead of isa. ([NSObject isa]): Remove obsolete category. * GSWeb.framework/GSWPageDefLexer.h: Use "GSWeb.h" instead of <GSWeb/GSWeb.h> for include. * GSWeb.framework/GSWPageDefLexer.m: Ditto. * GSWeb.framework/GSWPageDefParser.h: Ditto. * GSWeb.framework/GSWPageDefParser.m: Ditto. * GSWeb.framework/GSWPageDefParserTokenTypes.h: Ditto. * GSWeb.framework/PageDef.g: Ditto. * GSWeb.framework/html.g: Ditto. * GSWeb.framework/htmltag.g: Ditto. * GSWeb.framework/GSWWOCompatibility.m: Remove file. git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/gsweb/trunk@18338 72102866-910b-0410-8b05-ffd578937521
335 lines
7.5 KiB
Text
335 lines
7.5 KiB
Text
/*
|
|
Based on the HTML 3.2 spec. by the W3 (http://www.w3.org)
|
|
Alexander Hinds & Terence Parr
|
|
Magelang Institute, Ltd.
|
|
Send comments to: parrt@parr-research.com
|
|
|
|
v1.0 Terence John Parr (version 2.5.0 of ANTLR required)
|
|
|
|
Fixed how whitespace is handled, removing some ambiguities; some
|
|
because of ANTLR lexical filtering in 2.5.0.
|
|
|
|
Changed (TEXT)* loops to (TEXT)? general since TEXT matches
|
|
everything between valid tags (how could there be more than one
|
|
between tags?)
|
|
|
|
Made the DOCTYPE optional.
|
|
|
|
Reduced lookahead from k=5 to k=1 on the parser and number
|
|
of parser ambiguities to 2. Reduced lexer lookahead from 6
|
|
to 4; had to left factor a bunch of stuff.
|
|
|
|
List items couldn't contain nested lists...fixed it.
|
|
|
|
Fixed def of WORD so it can't be an INT. Removed '-' from WORD.
|
|
|
|
Fixed HEXNUM so it will allow letters A..F.
|
|
|
|
KNOWN ISSUES:
|
|
|
|
1. Does not handle "staggered" tags, eg: <p> <i> <p> <i>
|
|
|
|
2. Adheres somewhat strictly to the html spec, so many pages
|
|
won't parse without errors.
|
|
|
|
3. Doesn't convert &(a signifier) to its proper single char
|
|
representation
|
|
|
|
4. Checks only the syntax of element attributes, not the semantics,
|
|
e.g. won't verify that a base element's attribute is actually
|
|
called "href"
|
|
|
|
5. Tags split across lines, for example, <A (NEWLINE) some text >
|
|
won't be properly recognized. TJP: I think I fixed this.
|
|
|
|
7. Lines not counted properly due to the def'n of TEXT - see the
|
|
alternate def'n for a possible fix. TJP: I think I fixed this.
|
|
|
|
*/
|
|
header {
|
|
#include "GSWeb.h"
|
|
}
|
|
|
|
/* File-level option: selects the "Objc" code generator for both classes. */
options {
    language = "Objc";
}

/*
 * Parser half of the grammar.
 *
 *   exportVocab  token vocabulary is published under the name GSWHTML
 *   buildAST     AST construction is on; the rule actions use the
 *                generated <label>_AST variables
 *   k            one token of lookahead
 */
class GSWHTMLParser extends Parser;
options {
    exportVocab = GSWHTML;
    buildAST = true;
    k = 1;
}
{
    /* Members added to the generated parser class; the document rule
       clears both of them when it is entered. */
    NSMutableArray* errors;
    NSMutableArray* warnings;
}
|
|
|
|
|
|
/*
 * Entry rule: a document is a non-empty, flat sequence of lexer tokens.
 * Tag nesting is not checked here.
 *
 * For OPENTAG, CLOSETAG and COMMENT the action rewrites the text of the
 * AST node so that it no longer carries its delimiters ("<" ... ">",
 * "</" ... ">", "<!--" ... "-->").
 *
 * Fix: the CLOSETAG action used the selector stringbyDeletingSuffix:
 * (lowercase "b"). Objective-C selectors are case-sensitive, so that
 * message would not be recognized; it now uses stringByDeletingSuffix:
 * like the other alternatives.
 */
document
{
    /* Reset the diagnostic arrays before parsing. */
    DESTROY(errors);
    DESTROY(warnings);
}
    : (
          WS
        | TEXT
        | ot:OPENTAG   { [ot_AST setText:[[[ot_AST text] stringByDeletingPrefix:@"<"] stringByDeletingSuffix:@">"]]; }
        | ct:CLOSETAG  { [ct_AST setText:[[[ct_AST text] stringByDeletingPrefix:@"</"] stringByDeletingSuffix:@">"]]; }
        | com:COMMENT  { [com_AST setText:[[[com_AST text] stringByDeletingPrefix:@"<!--"] stringByDeletingSuffix:@"-->"]]; }
/*      | inc:INCLUDE { [inc_AST setText:[[[inc_AST text] stringByDeletingPrefix:@"<#include "] stringByDeletingSuffix:@"#>"]]; }*/
      )+
    ;
|
|
/*4*/
|
|
/*
 * Lexer half of the grammar.
 *
 *   k               three characters of lookahead
 *   exportVocab     same vocabulary name (GSWHTML) as the parser
 *   charVocabulary  characters '\3' through '\377'
 *   caseSensitive   input case is significant
 *   filter          input that no token rule matches is handed to the
 *                   UNDEFINED_TOKEN rule
 */
class GSWHTMLLexer extends Lexer;
options {
    k = 3;
    exportVocab = GSWHTML;
    charVocabulary = '\3'..'\377';
    caseSensitive = true;
    filter = UNDEFINED_TOKEN;
}
|
|
|
|
|
|
/* STRUCTURAL tags
|
|
*/
|
|
|
|
/*
 * Opening tag: '<', a tag name, then any mix of whitespace and
 * attributes, then '>'.
 *
 * The name is WORD or LETTER because WORD needs at least two
 * characters; LETTER covers one-letter names.
 */
OPENTAG
    : '<' ( WORD | LETTER ) ( (WS)* ( ATTR (WS)* )* )? '>'
    ;
|
|
|
|
/*
 * Closing tag: "</", a tag name (WORD, or LETTER for one-letter names),
 * then '>'. Nothing is accepted between the name and '>'.
 */
CLOSETAG
    : "</" ( WORD | LETTER ) '>'
    ;
|
|
|
|
/*
 * One attribute inside an opening tag: a WORD name, optionally followed
 * by '=' and a value. Whitespace may surround the '='.
 *
 * Accepted values:
 *   - a WORD
 *   - an INT with optional leading '-' and optional trailing '%'
 *   - a quoted STRING
 *   - a HEXNUM ('#' followed by hex digits)
 *
 * Helper rule (protected): only used from OPENTAG.
 */
protected
ATTR
    : WORD
      (
        (WS)* '=' (WS)*
        ( WORD
        | ('-')? INT ('%')?
        | STRING
        | HEXNUM
        )
      )?
    ;
|
|
|
|
/*MISC STUFF*/
|
|
|
|
/*
 * Character data: one or more items, each either a WS run or a single
 * character other than '<', '>', '\n' and '\r'. Line breaks are matched
 * through WS, which does the line counting.
 *
 * An earlier version also excluded the double quote from the character
 * set; the current one accepts it.
 */
TEXT
    : ( WS | ~( '<' | '\n' | '\r' | '>' ) )+
    ;
|
|
|
|
/* : (
|
|
See comment in WS. Language for combining any flavor
|
|
newline is ambiguous. Shutting off the warning.
|
|
|
|
options {
|
|
generateAmbigWarnings=false;
|
|
}
|
|
: '\r' '\n' {[self newline];}
|
|
| '\r' {[self newline];}
|
|
| '\n' {[self newline];}
|
|
| ~('<'|'\n'|'\r'|'"'|'>')
|
|
)+
|
|
;
|
|
*/
|
|
|
|
/*
 * HTML comment: "<!--", an optional body matched by COMMENT_DATA,
 * then "-->".
 */
COMMENT
    : "<!--" ( COMMENT_DATA )? "-->"
    ;
|
|
/*
|
|
INCLUDE
|
|
: "<#include " (INCLUDE_DATA)* "#>"
|
|
;
|
|
*/
|
|
/*
INCLUDE_DATA
    : (~("#>"))+
    ;
*/
/*
COMMENT_DATA
    : (~('<' | '!' | '>'))+
    ;
*/

/*
 * Body of a comment (helper rule, used by COMMENT).
 *
 * Newlines: '\r' '\n' can be matched in one alternative or by matching
 * '\r' in one iteration and '\n' in another. Accepting "\r\n", "\r" and
 * "\n" as newlines makes the language itself ambiguous, so the
 * resulting grammar must be ambiguous too; the warning is switched off.
 *
 * Dashes: a '-' is part of the body unless it is the first character of
 * the closing "-->", i.e. unless the two characters after it are '-'
 * and '>'.
 *
 * Fix: the guard used to be
 *     [self LA:2]!='-' && [self LA:3]!='>'
 * which is not the negation of "followed by ->". It refused a '-'
 * followed by another '-' (as in "a--b") and a '-' whose third
 * character happened to be '>'. It is now the direct negation.
 */
protected
COMMENT_DATA
    : (
        options {
            generateAmbigWarnings=false;
        }
        :
          {!([self LA:2]=='-' && [self LA:3]=='>')}? '-' // allow '-' if not "-->"
        | '\r' '\n' {[self newline];}
        | '\r'      {[self newline];}
        | '\n'      {[self newline];}
        | ~('-'|'\n'|'\r')
      )*
    ;
|
|
|
|
|
|
|
|
/*
|
|
PROTECTED LEXER RULES
|
|
*/
|
|
|
|
/*
 * Whitespace run (helper rule): one or more blanks, tabs or line
 * breaks. Every line break calls newline on the lexer.
 *
 * "\r\n" can be consumed by its own alternative or as '\r' in one
 * iteration and '\n' in the next. Supporting all three newline flavors
 * makes the rule ambiguous by construction, so the ambiguity warning is
 * turned off for this subrule.
 */
protected
WS
    : (
        options {
            generateAmbigWarnings = false;
        }
        : ' '
        | '\t'
        | '\n'   { [self newline]; }
        | "\r\n" { [self newline]; }
        | '\r'   { [self newline]; }
      )+
    ;
|
|
|
|
//the '.' is for words like "image.gif"
|
|
/*
 * Identifier-like word (helper rule), at least two characters long.
 *   first character : LETTER, '.' or '_'
 *   following ones  : LETTER, DIGIT, '.', '-' or '_' (one or more)
 *
 * In reality a WORD must be followed by whitespace, '=' or whatever may
 * follow an ATTR, such as '>'. The grammar does not spell that out; it
 * lists the followers as optional elements. Together with the attribute
 * loop in OPENTAG this means "widthheight" could be read as WORD WORD
 * or as one WORD, which is an ambiguity. ANTLR consumes input as soon
 * as possible, combining "widthheight" into a single WORD.
 *
 * The ambiguity warning is switched off because ANTLR does the right
 * thing: the loop's exit path is ambiguous with every alternative. The
 * only real fix would be an unnatural grammar with many extra
 * productions that lay out the possibilities explicitly.
 */
protected
WORD
    : ( LETTER | '.' | '_' )
      (
        options {
            generateAmbigWarnings = false;
        }
        : LETTER
        | DIGIT
        | '.'
        | '-'
        | '_'
      )+
    ;
|
|
|
|
/*
 * Quoted string (helper rule): delimited by double quotes or by single
 * quotes. The body is any run of characters other than the delimiter in
 * use; there is no escape mechanism.
 */
protected
STRING
    : '"'  ( ~'"'  )* '"'
    | '\'' ( ~'\'' )* '\''
    ;
|
|
|
|
/*
 * A single whitespace character (helper rule). '\n' and '\r' each call
 * newline on the lexer.
 * NOTE(review): no rule visible in this file refers to WSCHARS.
 */
protected
WSCHARS
    : ' '
    | '\t'
    | '\n'  { [self newline]; }
    | '\r'  { [self newline]; }
    ;
|
|
|
|
/*
 * Either '<' or '~' (helper rule).
 * NOTE(review): no rule visible in this file refers to SPECIAL.
 */
protected
SPECIAL
    : '<'
    | '~'
    ;
|
|
|
|
/* Hexadecimal attribute value (helper rule): '#' followed by a HEXINT. */
protected
HEXNUM
    : '#' HEXINT
    ;
|
|
|
|
/* Unsigned decimal integer (helper rule): one or more DIGITs. */
protected
INT
    : ( DIGIT )+
    ;
|
|
|
|
/*
 * One or more hexadecimal digits (helper rule).
 *
 * Technically a HEXINT cannot be followed by a..f, but in this loose
 * grammar the whitespace that would normally follow is optional. ANTLR
 * therefore reports that #4FACE could parse as HEXINT "#4" followed by
 * WORD "FACE", which is clearly bogus. ANTLR does the right thing by
 * consuming as much input as possible, so the warning is switched off.
 */
protected
HEXINT
    : (
        options {
            generateAmbigWarnings = false;
        }
        : HEXDIGIT
      )+
    ;
|
|
|
|
/* A single decimal digit, '0' through '9' (helper rule). */
protected
DIGIT
    : ( '0'..'9' )
    ;
|
|
|
|
/* A single hexadecimal digit; both letter cases are accepted (helper rule). */
protected
HEXDIGIT
    : '0'..'9'
    | 'A'..'F'
    | 'a'..'f'
    ;
|
|
|
|
/* A single lower-case ASCII letter (helper rule). */
protected
LCLETTER
    : ( 'a'..'z' )
    ;
|
|
|
|
/* A single upper-case ASCII letter (helper rule). */
protected
UPLETTER
    : ( 'A'..'Z' )
    ;
|
|
|
|
/* A single ASCII letter of either case (helper rule). */
protected
LETTER
    : ( LCLETTER | UPLETTER )
    ;
|
|
|
|
/*
 * Catch-all rule, named by the lexer's filter option.
 *
 *   alt 1  '<' ... '>' : anything shaped like a tag. Line breaks
 *          directly after it are consumed and counted, then the
 *          matched text is logged as an invalid tag.
 *   alt 2  a lone line break, which is counted.
 *   alt 3  any other single character.
 *
 * Inside alt 1 the usual newline hassle applies: "\r\n" can be matched
 * by the first alternative of the subrule, or by '\r' in one iteration
 * followed by '\n' in the next, so the ambiguity warning is turned off.
 */
protected
UNDEFINED_TOKEN
    : '<' ( ~'>' )* '>'
      (
        (
            options {
                generateAmbigWarnings = false;
            }
            : "\r\n" | '\r' | '\n'
        )
        { [self newline];}
      )*
      {NSLog(@"invalid tag: %@",[self text]);}
    | ( "\r\n" | '\r' | '\n' ) {[self newline];}
    | .
    ;
|
|
|
|
/*
|
|
: ('<' { NSLog(@"Warning: non-standard tag <%c",(char)[self LA:1]); } )
|
|
(~'>' { NSLog(@"%c",(char)[self LA:1]);} )*
|
|
('>' { NSLog(@" skipped."); } )
|
|
{ _ttype = ANTLRToken_SKIP; }
|
|
;
|
|
*/
|
|
|