/* Based on the HTML 3.2 spec. by the W3 (http://www.w3.org)
   Alexander Hinds & Terence Parr
   Magelang Institute, Ltd.
   Send comments to: parrt@parr-research.com

   v1.0 Terence John Parr (version 2.5.0 of ANTLR required)

   Fixed how whitespace was handled, removing some ambiguities; some
   because of ANTLR lexical filtering in 2.5.0.

   Changed (PCDATA)* loops to (PCDATA)? in general, since PCDATA matches
   everything between valid tags (how could there be more than one
   between tags?)

   Made the DOCTYPE optional.

   Reduced lookahead from k=5 to k=1 on the parser and the number of
   parser ambiguities to 2. Reduced lexer lookahead from 6 to 4; had to
   left factor a bunch of stuff.

   List items couldn't contain nested lists...fixed it.

   Fixed def of WORD so it can't be an INT. Removed '-' from WORD.

   Fixed HEXNUM so it will allow letters A..F.

   KNOWN ISSUES:

   1. Does not handle "staggered" tags, e.g.: <p> <i> </p> </i>

2. Adheres somewhat strictly to the HTML spec, so many pages won't parse
   without errors.
3. Doesn't convert &(a signifier) to its proper single char
   representation.
4. Checks only the syntax of element attributes, not the semantics, e.g.
   won't verify that a base element's attribute is actually called "href".
5. Tags split across lines (for example, an opening tag whose attributes
   continue on the next line) won't be properly recognized.
   TJP: I think I fixed this.
7. Lines not counted properly due to the def'n of PCDATA - see the
   alternate def'n for a possible fix. TJP: I think I fixed this.
*/

header {
#include "GSWeb.h"
}

options {
	language="Objc";
}

/*
 * Parser for HTML documents that may embed webobject elements.
 *
 * Tokens come from the WOHTML vocabulary; an AST is built (buildAST=true)
 * with one token of lookahead (k = 1).  In the rules below a trailing '^'
 * makes the token the root of the subtree being built and a trailing '!'
 * keeps the token out of the tree.
 *
 * Every '//' comment sits at the end of its own line: a '//' comment runs
 * to end-of-line, so rule text must never follow one on the same line.
 */
class WOHTMLParser extends Parser;
options {
	tokenVocabulary=WOHTML;
	buildAST=true;
	k = 1;
}

/* Whole document: every structural tag (doctype, html, head, body) is optional. */
document
	:	(PCDATA)? (DOCTYPE (PCDATA)?)?
		(OHTML^ (PCDATA)?)?
		(head)?
		(body)?
		(CHTML! (PCDATA)?)?
	;

/* Head section: at least one head element; the enclosing tags are optional. */
head
	:	(OHEAD^ (PCDATA)?)?
		head_element
		(PCDATA | head_element)*
		(CHEAD! (PCDATA)?)?
	;

head_element
	:	title	//bug need at least a title, rest optional
	|	script
	|	style
	|	ISINDEX
	|	BASE
	|	META
	|	LINK
	|	webobject
	;

title
	:	OTITLE^ (PCDATA)? CTITLE!
	;

/* script/style: consume every token up to the matching close tag. */
script
	:	OSCRIPT^ (~CSCRIPT)+ CSCRIPT!
	;

style
	:	OSTYLE^ (~CSTYLE)+ CSTYLE!
	;

/* Body: must start with a tag (not bare PCDATA) and contain further content. */
body
	:	( OBODY^ (PCDATA)? )?
		body_content_no_PCDATA
		( body_content )+
		( CBODY! (PCDATA)? )?
	;

body_content_no_PCDATA
	:	body_tag
	|	text_tag
	;

/*
 * NOTE(review): this rule references the token ADDRESS, while the
 * `address` rule below (OADDRESS ... CADDRESS) is not referenced by any
 * rule in this parser -- confirm whether `address` was intended here.
 */
body_tag
	:	heading
	|	block
	|	ADDRESS
	;

body_content
	:	body_tag
	|	text
	;

/*revised*/
heading
	:	h1 | h2 | h3 | h4 | h5 | h6
	;

block
	:	paragraph
	|	list
	|	preformatted
	|	div
	|	center
	|	blockquote
	|	HR
	|	table
	|	webobject
	;	//bug - ?FORM v %form, ISINDEX here too?

font
	:	teletype
	|	italic
	|	bold
	|	underline
	|	strike
	|	big
	|	small
	|	subscript
	|	superscript
	|	webobject
	;

phrase
	:	emphasize
	|	strong
	|	definition
	|	code
	|	sample_output
	|	keyboard_text
	|	variable
	|	citation
	|	webobject
	;

special
	:	anchor
	|	IMG
	|	applet
	|	font_dfn
	|	BFONT
	|	map
	|	BR
	;

text_tag
	:	font
	|	phrase
	|	special
	|	form
	|	webobject
	;

text
	:	PCDATA
	|	text_tag
	|	webobject
	;

/*end*/

/*BLOCK ELEMENTS*/

h1
	:	OH1^ (block | text)* CH1!
	;

h2
	:	OH2^ (block | text)* CH2!
	;

h3
	:	OH3^ (block | text)* CH3!
	;

h4
	:	OH4^ (block | text)* CH4!
	;

h5
	:	OH5^ (block | text)* CH5!
	;

h6
	:	OH6^ (block | text)* CH6!
	;

address
	:	OADDRESS (PCDATA)? CADDRESS
	;

//NOTE: according to the standard, paragraphs can't contain block elements
//like HR.  Netscape may insert these elements into paragraphs.
//We adhere strictly here.
paragraph
	:	OPARA^
		(
			/*	Rule body_content may also be just plain text because HTML is
				so loose.  When body puts body_content in a loop, ANTLR
				doesn't know whether you want it to match all the text as
				part of this paragraph (in the case where the closing </p>
				tag is missing) or if the body rule should scarf it.  This
				is analogous to the dangling-else clause.  I shut off the
				warning.
			*/
			options {
				generateAmbigWarnings=false;
			}
		:	text
		)*
		(CPARA)?!
	;

list
	:	unordered_list
	|	ordered_list
	|	def_list
	;

unordered_list
	:	OULIST^ (PCDATA)? (list_item)+ CULIST!
	;

ordered_list
	:	OOLIST^ (PCDATA)? (list_item)+ COLIST!
	;

def_list
	:	ODLIST^ (PCDATA)? (def_list_item)+ CDLIST!
	;

/* A list item may hold text and nested lists; its close tag is optional. */
list_item
	:	OLITEM^ ( text | list )+ (CLITEM! (PCDATA)?)?
	;

def_list_item
	:	dt
	|	dd
	;

dt
	:	ODTERM^ (text)+ CDTERM! (PCDATA)?
	;

/*
 * A definition description is closed by CDDEF, the close token the lexer
 * pairs with ODDEF.  (It previously reused CDTERM, dt's close token.)
 */
dd
	:	ODDEF^ (text | block)+ CDDEF! (PCDATA)?
	;

/* NOTE(review): dir and menu are not referenced by any rule in this parser. */
dir
	:	ODIR^ (list_item)+ CDIR!
	;

menu
	:	OMENU^ (list_item)+ CMENU!
	;

preformatted
	:	OPRE^ (text)+ CPRE!
	;

div
	:	ODIV^ (body_content)* CDIV!		//semi-revised
	;

center
	:	OCENTER^ (body_content)* CCENTER!	//semi-revised
	;

/* webobject element: arbitrary body content between its open and close tags. */
webobject
	:	OWEBOBJECT^ (body_content)* CWEBOBJECT!
	;

blockquote
	:	OBQUOTE^ PCDATA CBQUOTE!
	;

form
	:	OFORM^ (form_field | body_content)* CFORM!
	;

table
	:	OTABLE^ (caption)? (PCDATA)? (tr)+ CTABLE!
	;

caption
	:	OCAP^ (text)* CCAP!
	;

/* Table rows and cells: the close tags are optional. */
tr
	:	O_TR^ (PCDATA)? (th_or_td)* (C_TR! (PCDATA)?)?
	;

th_or_td
	:	O_TH_OR_TD^ (body_content)* (C_TH_OR_TD! (PCDATA)?)?
	;

/*TEXT ELEMENTS*/

/*font style*/

teletype
	:	OTTYPE^ ( text )+ CTTYPE!
	;

italic
	:	OITALIC^ ( text )+ CITALIC!
	;

bold
	:	OBOLD^ ( text )+ CBOLD!
	;

underline
	:	OUNDER^ ( text )+ CUNDER!
	;

strike
	:	OSTRIKE^ ( text )+ CSTRIKE!
	;

big
	:	OBIG^ ( text )+ CBIG!
	;

small
	:	OSMALL^ ( text )+ CSMALL!
	;

subscript
	:	OSUB^ ( text )+ CSUB!
	;

superscript
	:	OSUP^ ( text )+ CSUP!
	;

/*phrase elements*/

emphasize
	:	OEM^ ( text )+ CEM!
	;

strong
	:	OSTRONG^ ( text )+ CSTRONG!
	;

definition
	:	ODEF^ ( text )+ CDEF!
	;

code
	:	OCODE^ ( text )+ CCODE!
	;

sample_output
	:	OSAMP^ ( text )+ CSAMP!
	;

keyboard_text
	:	OKBD^ ( text )+ CKBD!
	;

variable
	:	OVAR^ ( text )+ CVAR!
	;

citation
	:	OCITE^ ( text )+ CCITE!
	;

/* form fields (combined with body_content elsewhere so no PCDATA on end) */
form_field
	:	INPUT
	|	select
	|	textarea
	;

select
	:	OSELECT^ (PCDATA)? (select_option)+ CSELECT!
	;

select_option
	:	SELOPT (PCDATA)?
	;

textarea
	:	OTAREA^ (PCDATA)? CTAREA!
	;

/* special text level elements*/

anchor
	:	OANCHOR^ (text)* CANCHOR!
	;

applet
	:	OAPPLET^ (APARAM)? (PCDATA)? CAPPLET!
	;

//not w3-no blocks allowed; www.microsoft.com uses
font_dfn
	:	OFONT^ (text)* CFONT!
	;

map
	:	OMAP^ (AREA)+ CMAP!
	;

/* NOTE(review): the tag literals in the lexer rules below look truncated or empty -- verify them against a pristine copy of this grammar. */
class WOHTMLLexer extends Lexer; options { k = 4; tokenVocabulary=WOHTML; charVocabulary = '\3'..'\377'; caseSensitive=false; filter=UNDEFINED_TOKEN; } /* STRUCTURAL tags */ DOCTYPE options { ignore=WS; } : "' ; OHTML : "" ; CHTML : "" ; OHEAD : "" ; CHEAD : "" ; OBODY : "' ; CBODY : "" ; /* HEAD ELEMENTS */ OTITLE : "" ; CTITLE : "" ; OSCRIPT : "" ; ISINDEX : "' ; META : "' ; LINK : "' ; /* headings */ OH1 : "' ; CH1 : "" ; OH2 : "' ; CH2 : "" ; OH3 : "' ; CH3 : "" ; OH4 : "' ; CH4 : "" ; OH5 : "' ; CH5 : "" ; OH6 : "' ; CH6 : "" ; OADDRESS : "
" ; CADDRESS : "
" ; OPARA : "' ; CPARA : "

" //it's optional ; /*UNORDERED LIST*/ OULIST : "' ; CULIST : "" ; /*ORDERED LIST*/ OOLIST : "' ; COLIST : "" ; /*LIST ITEM*/ OLITEM : "' ; CLITEM : "" ; /*DEFINITION LIST*/ ODLIST : "' ; CDLIST : "" ; ODTERM : "
" ; CDTERM : "
" ; ODDEF : "
" ; CDDEF : "
" ; ODIR: "" ; CDIR_OR_CDIV : "' ; ODIV: "' ; OMENU : "" ; CMENU : "" ; OPRE: ("
" | "") ('\n')? 
	;

CPRE:	 "</pre>" | "" 
	;

OCENTER
	:	"
" ; CCENTER : "
" ; OWEBOBJECT : "' ; CWEBOBJECT : "" ; OBQUOTE : "
" ; CBQUOTE : "
" ; //this is block element and thus can't be nested inside of //other block elements, ex: paragraphs. //Netscape appears to generate bad HTML vis-a-vis the standard. HR : "' ; OTABLE : "' ; CTABLE : "" ; OCAP: "' ; CCAP: "" ; O_TR : "' ; C_TR: "" ; O_TH_OR_TD : ("' ; C_TH_OR_TD : "" | "" ; /* PCDATA-LEVEL ELEMENTS */ /* font style elemens*/ OTTYPE : "" ; CTTYPE : "" ; OITALIC : "" ; CITALIC : "" ; OBOLD : "" ; CBOLD : "" ; OUNDER : "" ; CUNDER : "" ; /** Left-factor and to reduce lookahead */ OSTRIKE_OR_OSTRONG : "' ; CST_LEFT_FACTORED : "' ; OSTYLE : "