libs-gsweb/GSWeb.framework/html.g.complete
Manuel Guesdon a0b34ee1ee Initial revision
git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/gsweb/trunk@5815 72102866-910b-0410-8b05-ffd578937521
2000-01-22 12:49:49 +00:00

1070 lines
14 KiB
Text

/*
Based on the HTML 3.2 spec. by the W3 (http://www.w3.org)
Alexander Hinds & Terence Parr
Magelang Institute, Ltd.
Send comments to: parrt@parr-research.com
v1.0 Terence John Parr (version 2.5.0 of ANTLR required)
Fixed how whitespace is handled, removing some ambiguities; some
because of ANTLR lexical filtering in 2.5.0.
Changed (PCDATA)* loops to (PCDATA)? general since PCDATA matches
everything between valid tags (how could there be more than one
between tags?)
Made the DOCTYPE optional.
Reduced lookahead from k=5 to k=1 on the parser and number
of parser ambiguities to 2. Reduced lexer lookahead from 6
to 4; had to left factor a bunch of stuff.
List items couldn't contain nested lists...fixed it.
Fixed def of WORD so it can't be an INT. Removed '-' from WORD.
Fixed HEXNUM so it will allow letters A..F.
KNOWN ISSUES:
1. Does not handle "staggered" tags, eg: <p> <i> <p> <i>
2. Adheres somewhat strictly to the html spec, so many pages
won't parse without errors.
3. Doesn't convert &(a signifier) to its proper single char
representation
4. Checks only the syntax of element attributes, not the semantics,
e.g. won't verify that a base element's attribute is actually
called "href"
5. Tags split across lines, for example, <A (NEWLINE) some text >
won't be properly recognized. TJP: I think I fixed this.
7. Lines not counted properly due to the def'n of PCDATA - see the
alternate def'n for a possible fix. TJP: I think I fixed this.
*/
// Header section: its content (an #include of GSWeb.h) is copied into the
// generated sources.
header {
#include "GSWeb.h"
}
// Grammar-file-wide options: select the "Objc" code generator.
options {
language="Objc";
}
// Parser for HTML documents extended with <webobject> elements.
// All parser rules up to the WOHTMLLexer class declaration belong to it.
class WOHTMLParser extends Parser;
// tokenVocabulary: token set named WOHTML (the lexer below uses the same name).
// buildAST:        construct an AST; '^' marks a root node, '!' omits a node.
// k:               one token of lookahead.
options {
tokenVocabulary=WOHTML;
buildAST=true;
k = 1;
}
// Start rule. Every structural part (DOCTYPE, <html>, head, body, </html>)
// is optional; PCDATA may appear between them.
// <html> becomes an AST root, </html> is dropped from the AST.
document
: (PCDATA)? (DOCTYPE (PCDATA)?)?
(OHTML^ (PCDATA)?)?
(head)?
(body)?
(CHTML! (PCDATA)?)?
;
// Document head. The <head> and </head> tags are optional, but at least one
// head_element must be present; further elements may be separated by PCDATA.
head: (OHEAD^ (PCDATA)?)?
head_element
(PCDATA | head_element)*
(CHEAD! (PCDATA)?)?
;
// Any single element that may appear in the head, including <webobject>.
head_element
: title //bug need at least a title, rest optional
| script
| style
| ISINDEX
| BASE
| META
| LINK
| webobject
;
// <title>, optionally containing text; the closing tag is dropped from the AST.
title
: OTITLE^ (PCDATA)? CTITLE!
;
// <script>: one or more tokens of any type other than CSCRIPT, then </script>.
script
: OSCRIPT^ (~CSCRIPT)+ CSCRIPT!
;
// <style>: one or more tokens of any type other than CSTYLE, then </style>.
style
: OSTYLE^ (~CSTYLE)+ CSTYLE!
;
// Document body. The <body> and </body> tags are optional. The first item must
// be a tag (not PCDATA) and must be followed by at least one body_content.
body: ( OBODY^ (PCDATA)? )?
body_content_no_PCDATA
( body_content )+
( CBODY! (PCDATA)? )?
;
// Body content that starts with a tag: a block-level or a text-level element.
body_content_no_PCDATA
: body_tag | text_tag
;
// Block-level body elements: headings, block elements and <address>.
// The third alternative invokes the `address` parser rule (OADDRESS ... CADDRESS).
// The lexer defines no ADDRESS token, so referencing a token of that name here
// would leave <address> elements unparseable and the `address` rule unreachable.
body_tag
: heading | block | address
;
// General body content: a block-level element or text-level content.
body_content
: body_tag | text
;
/*revised*/
// Any of the six heading levels.
heading
: h1 | h2 | h3 | h4 | h5 | h6
;
// Block-level elements. HR is a single token; the rest are parser rules.
block
: paragraph | list | preformatted | div |
center | blockquote | HR | table | webobject
; //bug - ?FORM v %form, ISINDEX here too?
// Font-style elements (plus webobject).
font: teletype | italic | bold | underline | strike |
big | small | subscript | superscript | webobject
;
// Phrase-level elements (plus webobject).
phrase
: emphasize | strong | definition | code | sample_output|
keyboard_text | variable | citation | webobject
;
// Special text-level elements; IMG, BFONT and BR are single tokens.
special
: anchor | IMG | applet | font_dfn | BFONT |
map | BR
;
// Text-level element that starts with a tag.
// NOTE: webobject is reachable here directly and also through font and phrase.
text_tag
: font | phrase | special | form | webobject
;
// Text-level content: raw character data or a text-level element.
// NOTE: webobject is listed here although text_tag already includes it.
text: PCDATA | text_tag | webobject
;
/*end*/
/*BLOCK ELEMENTS*/
// Headings h1..h6 share one shape: the opening tag becomes the AST root,
// the content is any mix of block and text (possibly empty), and the
// matching closing tag is required but dropped from the AST.
h1 : OH1^ (block | text)* CH1!
;
h2 : OH2^ (block | text)* CH2!
;
h3 : OH3^ (block | text)* CH3!
;
h4 : OH4^ (block | text)* CH4!
;
h5 : OH5^ (block | text)* CH5!
;
h6 : OH6^ (block | text)* CH6!
;
// <address> element with optional text content.
// Unlike the other element rules, no '^' or '!' AST operators are applied
// here, so both tags are kept as ordinary nodes.
address
: OADDRESS (PCDATA)? CADDRESS
;
//NOTE: according to the standard, paragraphs can't contain block elements
//like HR. Netscape may insert these elements into paragraphs.
//We adhere strictly here.
// Paragraph: <p> followed by any amount of text; the closing </p> is optional
// and is dropped from the AST when present.
paragraph
: OPARA^
(
/* Rule body_content may also be just plain text because HTML is
so loose. When body puts body_content in a loop, ANTLR
doesn't know whether you want it to match all the text as part
of this paragraph (in the case where the </p> is missing) or
if the body rule should scarf it. This is analogous to the
dangling-else clause. I shut off the warning.
*/
options {
generateAmbigWarnings=false;
}
: text
)*
(CPARA)?!
;
// Any of the three list kinds. The dir and menu rules are defined in this
// grammar but are not alternatives here.
list: unordered_list
| ordered_list
| def_list
;
// <ul>: optional leading text, one or more items, required </ul>.
unordered_list
: OULIST^ (PCDATA)? (list_item)+ CULIST!
;
// <ol>: optional leading text, one or more items, required </ol>.
ordered_list
: OOLIST^ (PCDATA)? (list_item)+ COLIST!
;
// <dl>: optional leading text, one or more dt/dd items, required </dl>.
def_list
: ODLIST^ (PCDATA)? (def_list_item)+ CDLIST!
;
// <li>: at least one text item or nested list; </li> is optional.
list_item
: OLITEM^ ( text | list )+ (CLITEM! (PCDATA)?)?
;
// One entry of a definition list: a term or a definition.
def_list_item
: dt | dd
;
// <dt>: one or more text items, required </dt>, optional trailing text.
dt : ODTERM^ (text)+ CDTERM! (PCDATA)?
;
// <dd>: one or more text or block items, then the required closing tag and
// optional trailing text. The closing token is CDDEF ("</dd>"); CDTERM is
// "</dt>" and belongs to the dt rule, so using it here would reject every
// well-formed <dd>...</dd>.
dd : ODDEF^ (text | block)+ CDDEF! (PCDATA)?
;
// <dir>: one or more list items, required </dir>.
dir : ODIR^ (list_item)+ CDIR!
;
// <menu>: one or more list items, required </menu>.
menu: OMENU^ (list_item)+ CMENU!
;
// Preformatted text; the lexer maps both <pre> and <xmp> to OPRE/CPRE.
preformatted
: OPRE^ (text)+ CPRE!
;
// <div>: any body content (possibly none), required </div>.
div : ODIV^ (body_content)* CDIV! //semi-revised
;
// <center>: any body content (possibly none), required </center>.
center
: OCENTER^ (body_content)* CCENTER! //semi-revised
;
// <webobject>: any body content (possibly none), required </webobject>.
webobject
: OWEBOBJECT^ (body_content)* CWEBOBJECT!
;
// <blockquote>: exactly one PCDATA token is required as content.
blockquote
: OBQUOTE^ PCDATA CBQUOTE!
;
// <form>: form fields mixed with body content, required </form>.
form: OFORM^ (form_field | body_content)* CFORM!
;
// <table>: optional caption, optional text, one or more rows, required </table>.
table
: OTABLE^ (caption)? (PCDATA)? (tr)+ CTABLE!
;
// <caption>: any amount of text, required </caption>.
caption
: OCAP^ (text)* CCAP!
;
// Table row: optional text, any number of cells; </tr> is optional.
tr : O_TR^ (PCDATA)? (th_or_td)* (C_TR! (PCDATA)?)?
;
// Table cell (<th> or <td> share one token): body content; closing tag optional.
th_or_td
: O_TH_OR_TD^ (body_content)* (C_TH_OR_TD! (PCDATA)?)?
;
/*TEXT ELEMENTS*/
/*font style*/
// Every rule below has the same shape: the opening tag becomes the AST root,
// at least one text item is required, and the closing tag is required but
// dropped from the AST.
teletype
: OTTYPE^ ( text )+ CTTYPE!
;
italic
: OITALIC^ ( text )+ CITALIC!
;
bold: OBOLD^ ( text )+ CBOLD!
;
underline
: OUNDER^ ( text )+ CUNDER!
;
strike
: OSTRIKE^ ( text )+ CSTRIKE!
;
big : OBIG^ ( text )+ CBIG!
;
small
: OSMALL^ ( text )+ CSMALL!
;
subscript
: OSUB^ ( text )+ CSUB!
;
superscript
: OSUP^ ( text )+ CSUP!
;
/*phrase elements*/
emphasize
: OEM^ ( text )+ CEM!
;
strong
: OSTRONG^ ( text )+ CSTRONG!
;
// <dfn> element: opening tag as AST root, one or more text items, required
// closing tag (dropped from the AST). ODFN and CDFN are the token names the
// lexer defines for "<dfn>" and "</dfn>"; no lexer rule produces ODEF or CDEF,
// so referencing those names would make this rule unmatchable.
definition
: ODFN^ ( text )+ CDFN!
;
// Remaining phrase elements: opening tag as AST root, one or more text items,
// required closing tag dropped from the AST.
code
: OCODE^ ( text )+ CCODE!
;
sample_output
: OSAMP^ ( text )+ CSAMP!
;
keyboard_text
: OKBD^ ( text )+ CKBD!
;
variable
: OVAR^ ( text )+ CVAR!
;
citation
: OCITE^ ( text )+ CCITE!
;
/* form fields (combined with body_content elsewhere so no PCDATA on end) */
// A form control: a single INPUT token, a select list or a textarea.
form_field
: INPUT | select | textarea
;
// <select>: optional text, one or more options, required </select>.
select
: OSELECT^ (PCDATA)? (select_option)+ CSELECT!
;
// <option> tag followed by optional text; there is no closing-tag token.
select_option
: SELOPT (PCDATA)?
;
// <textarea>: optional text, required </textarea>.
textarea
: OTAREA^ (PCDATA)? CTAREA!
;
/* special text level elements*/
// <a>: any amount of text (possibly none), required </a>.
anchor
: OANCHOR^ (text)* CANCHOR!
;
// <applet>: at most one parameter token, optional text, required closing tag.
applet
: OAPPLET^ (APARAM)? (PCDATA)? CAPPLET!
;
//not w3-no blocks allowed; www.microsoft.com uses
// <font>: any amount of text (possibly none), required </font>.
font_dfn
: OFONT^ (text)* CFONT!
;
// <map>: one or more AREA tokens, required </map>.
map : OMAP^ (AREA)+ CMAP!
;
// Lexer producing the WOHTML token vocabulary consumed by WOHTMLParser.
class WOHTMLLexer extends Lexer;
// k:              four characters of lookahead.
// charVocabulary: characters '\3' through '\377'.
// caseSensitive:  false, so the lower-case literals below match any case.
// filter:         input matching no token rule is handed to UNDEFINED_TOKEN.
options {
k = 4;
tokenVocabulary=WOHTML;
charVocabulary = '\3'..'\377';
caseSensitive=false;
filter=UNDEFINED_TOKEN;
}
/* STRUCTURAL tags
*/
// <!doctype html public "..."> ; WS between the elements is ignored.
DOCTYPE
options {
ignore=WS;
}
: "<!doctype" "html" "public" STRING '>'
;
// <html>, <head> and their closing tags take no attributes.
OHTML
: "<html>"
;
CHTML
: "</html>"
;
OHEAD
: "<head>"
;
CHEAD
: "</head>"
;
// <body> with an optional attribute list.
OBODY
: "<body" (WS (ATTR )*)? '>'
;
CBODY
: "</body>"
;
/* HEAD ELEMENTS
*/
OTITLE
: "<title>"
;
CTITLE
: "</title>"
;
// <script> is matched only without attributes.
OSCRIPT
: "<script>"
;
CSCRIPT
: "</script>"
;
// <isindex> requires exactly one attribute.
ISINDEX
: "<isindex" WS ATTR '>'
;
// <meta> and <link> require one or more attributes.
META
: "<meta" WS (ATTR)+ '>'
;
LINK
: "<link" WS (ATTR)+ '>'
;
/* headings */
// Each opening heading tag accepts at most one attribute.
OH1 : "<h1" (WS ATTR)? '>'
;
CH1 : "</h1>"
;
OH2 : "<h2" (WS ATTR)?'>'
;
CH2 : "</h2>"
;
OH3 : "<h3" (WS ATTR)? '>'
;
CH3 : "</h3>"
;
OH4 : "<h4" (WS ATTR)? '>'
;
CH4 : "</h4>"
;
OH5 : "<h5" (WS ATTR)? '>'
;
CH5 : "</h5>"
;
OH6 : "<h6" (WS ATTR)? '>'
;
CH6 : "</h6>"
;
// <address> takes no attributes.
OADDRESS
: "<address>"
;
CADDRESS
: "</address>"
;
// <p> accepts at most one attribute.
OPARA
: "<p" (WS ATTR)? '>'
;
CPARA
: "</p>" //it's optional
;
/*UNORDERED LIST*/
// The list opening tags (<ul>, <ol>, <li>, <dl>) accept at most one attribute.
OULIST
: "<ul" (WS ATTR)? '>'
;
CULIST
: "</ul>"
;
/*ORDERED LIST*/
OOLIST
: "<ol" (WS ATTR)? '>'
;
COLIST
: "</ol>"
;
/*LIST ITEM*/
OLITEM
: "<li" (WS ATTR)? '>'
;
CLITEM
: "</li>"
;
/*DEFINITION LIST*/
ODLIST
: "<dl" (WS ATTR)? '>'
;
CDLIST
: "</dl>"
;
// <dt> and <dd> take no attributes.
ODTERM
: "<dt>"
;
CDTERM
: "</dt>"
;
ODDEF
: "<dd>"
;
CDDEF
: "</dd>"
;
ODIR: "<dir>"
;
// </dir> and </div> share the prefix "</di"; they are left-factored into one
// rule and the token type is set from the character that follows.
CDIR_OR_CDIV
: "</di"
( 'r' {$setType(WOHTMLTokenType_CDIR);}
| 'v' {$setType(WOHTMLTokenType_CDIV);}
)
'>'
;
ODIV: "<div" (WS ATTR)? '>'
;
OMENU
: "<menu>"
;
CMENU
: "</menu>"
;
// <pre> and <xmp> map to the same token; one '\n' directly after the opening
// tag is consumed as part of the token.
OPRE: ("<pre>" | "<xmp>") ('\n')?
;
CPRE: "</pre>" | "</xmp>"
;
// <center> takes no attributes.
OCENTER
: "<center>"
;
CCENTER
: "</center>"
;
// <webobject> with an optional attribute list.
OWEBOBJECT
: "<webobject" (WS (ATTR)*)? '>'
;
CWEBOBJECT
: "</webobject>"
;
OBQUOTE
: "<blockquote>"
;
CBQUOTE
: "</blockquote>"
;
//this is block element and thus can't be nested inside of
//other block elements, ex: paragraphs.
//Netscape appears to generate bad HTML vis-a-vis the standard.
HR : "<hr" (WS (ATTR)*)? '>'
;
// Table tags: each opening tag accepts an optional attribute list.
OTABLE
: "<table" (WS (ATTR)*)? '>'
;
CTABLE
: "</table>"
;
OCAP: "<caption" (WS (ATTR)*)? '>'
;
CCAP: "</caption>"
;
O_TR
: "<tr" (WS (ATTR)*)? '>'
;
C_TR: "</tr>"
;
// <th> and <td> are not distinguished: both yield the same token type.
O_TH_OR_TD
: ("<th" | "<td") (WS (ATTR)*)? '>'
;
C_TH_OR_TD
: "</th>" | "</td>"
;
/* PCDATA-LEVEL ELEMENTS
*/
/* font style elemens*/
// None of the font-style tags accepts attributes.
OTTYPE
: "<tt>"
;
CTTYPE
: "</tt>"
;
OITALIC
: "<i>"
;
CITALIC
: "</i>"
;
OBOLD
: "<b>"
;
CBOLD
: "</b>"
;
OUNDER
: "<u>"
;
CUNDER
: "</u>"
;
/** Left-factor <strike> and <strong> to reduce lookahead */
OSTRIKE_OR_OSTRONG
: "<str"
( "ike" {$setType(WOHTMLTokenType_OSTRIKE);}
| "ong" {$setType(WOHTMLTokenType_OSTRONG);}
)
'>'
;
// </strike>, </strong> and </style> share the prefix "</st"; the token type
// is set from the remainder of the tag name.
CST_LEFT_FACTORED
: "</st"
( "rike" {$setType(WOHTMLTokenType_CSTRIKE);}
| "rong" {$setType(WOHTMLTokenType_CSTRONG);}
| "yle" {$setType(WOHTMLTokenType_CSTYLE);}
)
'>'
;
OSTYLE
: "<style>"
;
OBIG: "<big>"
;
CBIG: "</big>"
;
OSMALL
: "<small>"
;
CSMALL
: "</small>"
;
OSUB: "<sub>"
;
OSUP: "<sup>"
;
// </sub> and </sup> share the prefix "</su"; the token type is set from the
// final letter.
CSUB_OR_CSUP
: "</su"
( 'b' {$setType(WOHTMLTokenType_CSUB);}
| 'p' {$setType(WOHTMLTokenType_CSUP);}
)
'>'
;
/* phrase elements*/
// None of the phrase tags accepts attributes.
OEM : "<em>"
;
CEM : "</em>"
;
ODFN: "<dfn>"
;
CDFN: "</dfn>"
;
OCODE
: "<code>"
;
CCODE
: "</code>"
;
OSAMP
: "<samp>"
;
CSAMP
: "</samp>"
;
OKBD: "<kbd>"
;
CKBD: "</kbd>"
;
OVAR: "<var>"
;
CVAR: "</var>"
;
OCITE
: "<cite>"
;
// Closing </cite> tag. The token is named CCITE because that is the name the
// parser's citation rule references; under any other name the parser would
// wait for a token type the lexer never emits.
CCITE
: "</cite>"
;
/* form fields*/
// Form-control opening tags accept an optional attribute list.
INPUT
: "<input" (WS (ATTR)*)? '>'
;
OSELECT
: "<select" (WS (ATTR)*)? '>'
;
CSELECT
: "</select>"
;
OTAREA
: "<textarea" (WS (ATTR)*)? '>'
;
CTAREA
: "</textarea>"
;
SELOPT
: "<option" (WS (ATTR)*)? '>'
;
/* special text level elements*/
// <a>, <img> and <applet> require whitespace and at least one attribute.
OANCHOR
: "<a" WS (ATTR)+ '>'
;
CANCHOR
: "</a>"
;
IMG : "<img" WS (ATTR)+ '>'
;
OAPPLET
: "<applet" WS (ATTR)+ '>'
;
// Closing </applet> tag. Named CAPPLET to match the token referenced by the
// parser's applet rule (and the C-prefix convention of other closing tags).
CAPPLET
: "</applet>"
;
// <param ...> tag inside an applet; requires whitespace and at least one
// attribute. Named APARAM to match the token referenced by the parser's
// applet rule.
APARAM
: "<param" WS (ATTR)+'>'
;
// <form> and <font> require whitespace and at least one attribute.
OFORM
: "<form" WS (ATTR)+ '>'
;
OFONT
: "<font" WS (ATTR)+ '>'
;
// </form> and </font> share the prefix "</fo"; the token type is set from
// the remainder of the tag name.
CFORM_OR_CFONT
: "</fo"
( "rm" {$setType(WOHTMLTokenType_CFORM);}
| "nt" {$setType(WOHTMLTokenType_CFONT);}
)
'>'
;
/*
CFORM
: "</form>"
;
CFONT
: "</font>"
;
*/
// <basefont ...> and <base ...> share the prefix "<base"; each takes exactly
// one attribute, and the token type is set according to the branch taken.
BFONT_OR_BASE
: "<base"
( "font" WS ATTR {$setType(WOHTMLTokenType_BFONT);}
| WS ATTR {$setType(WOHTMLTokenType_BASE);}
)
'>'
;
/*
BFONT
: "<basefont" WS ATTR '>'
;
BASE: "<base" WS ATTR '>'
;
*/
// <br> accepts at most one attribute.
BR
: "<br" (WS ATTR)? '>'
;
// <map> requires exactly one attribute.
OMAP
: "<map" WS ATTR '>'
;
CMAP: "</map>"
;
// <area> requires one or more attributes.
AREA: "<area" WS (ATTR)+ '>'
;
/*MISC STUFF*/
// Character data between tags: one or more characters, with every flavor of
// newline reported through [self newline].
// Besides '<', the characters '"' and '>' are excluded from PCDATA.
PCDATA
: (
/* See comment in WS. Language for combining any flavor
* newline is ambiguous. Shutting off the warning.
*/
options {
generateAmbigWarnings=false;
}
: '\r' '\n' {[self newline];}
| '\r' {[self newline];}
| '\n' {[self newline];}
| ~('<'|'\n'|'\r'|'"'|'>')
)+
;
// Body of a comment: one or more characters other than '<', '!' and '>'.
// Newlines matched here are not reported through [self newline].
// NOTE(review): a comment whose text contains '<', '!' or '>' cannot be
// matched by COMMENT because of this exclusion set - confirm that is intended.
protected
COMMENT_DATA
: (~('<' | '!' | '>'))+
;
// <!-- ... --> ; the token type is set to ANTLRToken_SKIP.
COMMENT
: "<!--" (COMMENT_DATA)? "-->" { _ttype = ANTLRToken_SKIP; }
;
/*
PROTECTED LEXER RULES
*/
// One or more blanks, tabs or newlines; each newline flavor calls
// [self newline]. Protected: only used from inside other lexer rules.
protected
WS : (
/* '\r' '\n' can be matched in one alternative or by matching
'\r' in one iteration and '\n' in another. I am trying to
handle any flavor of newline that comes in, but the language
that allows both "\r\n" and "\r" and "\n" to all be valid
newline is ambiguous. Consequently, the resulting grammar
must be ambiguous. I'm shutting this warning off.
*/
options {
generateAmbigWarnings=false;
}
: ' '
| '\t'
| '\n' { [self newline]; }
| "\r\n" { [self newline]; }
| '\r' { [self newline]; }
)+
;
// One tag attribute: a name, optionally followed by '=' and a value.
// The value is a WORD with an optional trailing '%', an optionally negative
// INT, a quoted STRING, or a HEXNUM. WS between the elements is ignored.
protected
ATTR
options {
ignore=WS;
}
: WORD ('=' (WORD ('%')? | ('-')? INT | STRING | HEXNUM))?
;
//don't need uppercase for case-insen.
//the '.' is for words like "image.gif"
// A WORD starts with a letter or '.', followed by one or more letters,
// digits or '.'. Because the trailing loop is (...)+, a WORD is at least two
// characters long.
protected
WORD: ( LCLETTER
| '.'
)
(
/* In reality, a WORD must be followed by whitespace, '=', or
what can follow an ATTR such as '>'. In writing this grammar,
however, we just list all the possibilities as optional
elements. This is loose, allowing the case where nothing is
matched after a WORD and then the (ATTR)* loop means the
grammar would allow "widthheight" as WORD WORD or WORD, hence,
an ambiguity. Naturally, ANTLR will consume the input as soon
as possible, combing "widthheight" into one WORD.
I am shutting off the ambiguity here because ANTLR does the
right thing. The exit path is ambiguous with ever
alternative. The only solution would be to write an unnatural
grammar (lots of extra productions) that laid out the
possibilities explicitly, preventing the bogus WORD followed
immediately by WORD without whitespace etc...
*/
options {
generateAmbigWarnings=false;
}
: LCLETTER
| DIGIT
| '.'
)+
;
// Quoted attribute value: double- or single-quoted; the content may be empty
// and may contain any character except the delimiting quote.
protected
STRING
: '"' (~'"')* '"'
| '\'' (~'\'')* '\''
;
// A single whitespace character. Not referenced by any rule in this grammar.
protected
WSCHARS
: ' ' | '\t' | '\n' | '\r'
;
// Not referenced by any rule in this grammar.
protected
SPECIAL
: '<' | '~'
;
// Hexadecimal value introduced by '#'.
protected
HEXNUM
: '#' HEXINT
;
// One or more decimal digits.
protected
INT : (DIGIT)+
;
// One or more hexadecimal digits.
protected
HEXINT
: (
/* Technically, HEXINT cannot be followed by a..f, but due to our
loose grammar, the whitespace that normally would follow this
rule is optional. ANTLR reports that #4FACE could parse as
HEXINT "#4" followed by WORD "FACE", which is clearly bogus.
ANTLR does the right thing by consuming a much input as
possible here. I shut the warning off.
*/
options {
generateAmbigWarnings=false;
}
: HEXDIGIT
)+
;
protected
DIGIT
: '0'..'9'
;
// Only lower-case a..f are listed; the lexer is case-insensitive.
protected
HEXDIGIT
: '0'..'9'
| 'a'..'f'
;
// Only lower-case letters are listed; the lexer is case-insensitive.
protected
LCLETTER
: 'a'..'z'
;
// Filter rule named by the lexer's filter option. Three alternatives:
//  1. an unrecognized tag "<...>" plus any newlines directly after it
//     (counted via [self newline]); the tag text is logged with NSLog;
//  2. a lone newline, counted via [self newline];
//  3. any other single character.
protected
UNDEFINED_TOKEN
: '<' (~'>')* '>'
(
( /* the usual newline hassle: \r\n can be matched in alt 1
* or by matching alt 2 followed by alt 3 in another iteration.
*/
options {
generateAmbigWarnings=false;
}
: "\r\n" | '\r' | '\n'
)
{ [self newline];}
)*
{NSLog(@"invalid tag: %@",[self text]);}
| ( "\r\n" | '\r' | '\n' ) {[self newline];}
| .
;
/*
: ('<' { NSLog(@"Warning: non-standard tag <%c",(char)[self LA:1]); } )
(~'>' { NSLog(@"%c",(char)[self LA:1]);} )*
('>' { NSLog(@" skipped."); } )
{ _ttype = ANTLRToken_SKIP; }
;
*/