mirror of
https://github.com/gnustep/libs-gsweb.git
synced 2025-02-22 19:21:23 +00:00
git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/gsweb/trunk@5815 72102866-910b-0410-8b05-ffd578937521
1070 lines
14 KiB
Text
1070 lines
14 KiB
Text
/*
|
|
Based on the HTML 3.2 spec. by the W3 (http://www.w3.org)
|
|
Alexander Hinds & Terence Parr
|
|
Magelang Institute, Ltd.
|
|
Send comments to: parrt@parr-research.com
|
|
|
|
v1.0 Terence John Parr (version 2.5.0 of ANTLR required)
|
|
|
|
Fixed how whitespace was handled, removing some ambiguities; some
|
|
because of ANTLR lexical filtering in 2.5.0.
|
|
|
|
Changed (PCDATA)* loops to (PCDATA)? general since PCDATA matches
|
|
everything between valid tags (how could there be more than one
|
|
between tags?)
|
|
|
|
Made the DOCTYPE optional.
|
|
|
|
Reduced lookahead from k=5 to k=1 on the parser and number
|
|
of parser ambiguities to 2. Reduced lexer lookahead from 6
|
|
to 4; had to left factor a bunch of stuff.
|
|
|
|
List items couldn't contain nested lists...fixed it.
|
|
|
|
Fixed def of WORD so it can't be an INT. Removed '-' from WORD.
|
|
|
|
Fixed HEXNUM so it will allow letters A..F.
|
|
|
|
KNOWN ISSUES:
|
|
|
|
1. Does not handle "staggered" tags, eg: <p> <i> <p> <i>
|
|
|
|
2. Adheres somewhat strictly to the html spec, so many pages
|
|
won't parse without errors.
|
|
|
|
3. Doesn't convert &(a signifier) to its proper single char
|
|
representation
|
|
|
|
4. Checks only the syntax of element attributes, not the semantics,
|
|
e.g. won't verify that a base element's attribute is actually
|
|
called "href"
|
|
|
|
5. Tags split across lines, for example, <A (NEWLINE) some text >
|
|
won't be properly recognized. TJP: I think I fixed this.
|
|
|
|
7. Lines not counted properly due to the def'n of PCDATA - see the
|
|
alternate def'n for a possible fix. TJP: I think I fixed this.
|
|
|
|
*/
|
|
header {
|
|
#include "GSWeb.h"
|
|
}
|
|
|
|
options {
|
|
language="Objc";
|
|
}
|
|
|
|
class WOHTMLParser extends Parser;
options {
    tokenVocabulary=WOHTML;
    buildAST=true;
    k = 1;
}

// Parser half of the WOHTML grammar.  Token names are shared with the
// lexer through the WOHTML vocabulary.
//
// AST conventions used by every rule below:
//   TOKEN^   the token becomes the root of the subtree built by the rule
//   TOKEN!   the token is matched but left out of the tree
// Opening tags are made roots and closing tags are dropped.

// A complete document.  The DOCTYPE and each structural tag are optional.
document
    :   (PCDATA)? (DOCTYPE (PCDATA)?)?
        (OHTML^ (PCDATA)?)?
        (head)?
        (body)?
        (CHTML! (PCDATA)?)?
    ;

// The head section; both <head> and </head> may be omitted, but at least
// one head element is required.
head
    :   (OHEAD^ (PCDATA)?)?
        head_element
        (PCDATA | head_element)*
        (CHEAD! (PCDATA)?)?
    ;

head_element
    :   title   //bug need at least a title, rest optional
    |   script
    |   style
    |   ISINDEX
    |   BASE
    |   META
    |   LINK
    |   webobject
    ;

title
    :   OTITLE^ (PCDATA)? CTITLE!
    ;

// Everything up to the closing tag is accepted, whatever token it is.
script
    :   OSCRIPT^ (~CSCRIPT)+ CSCRIPT!
    ;

style
    :   OSTYLE^ (~CSTYLE)+ CSTYLE!
    ;

// The body must start with a tag (not plain text) and contain at least one
// further piece of content; <body> and </body> themselves are optional.
body
    :   ( OBODY^ (PCDATA)? )?
        body_content_no_PCDATA
        ( body_content )+
        ( CBODY! (PCDATA)? )?
    ;

body_content_no_PCDATA
    :   body_tag | text_tag
    ;

// FIX: the last alternative used to be the token ADDRESS, which no lexer
// rule produces.  The parser rule 'address' (OADDRESS ... CADDRESS) is what
// was intended, and was unreachable before this change.
body_tag
    :   heading | block | address
    ;

body_content
    :   body_tag | text
    ;

/*revised*/
heading
    :   h1 | h2 | h3 | h4 | h5 | h6
    ;

block
    :   paragraph | list | preformatted | div
    |   center | blockquote | HR | table | webobject
    ;   //bug - ?FORM v %form, ISINDEX here too?

// 'webobject' is deliberately not listed in font, phrase or text: text_tag
// already offers it, and repeating it in these rules only produced
// nondeterminism at k=1 without changing what is accepted or the AST.
font
    :   teletype | italic | bold | underline | strike
    |   big | small | subscript | superscript
    ;

phrase
    :   emphasize | strong | definition | code | sample_output
    |   keyboard_text | variable | citation
    ;

special
    :   anchor | IMG | applet | font_dfn | BFONT
    |   map | BR
    ;

text_tag
    :   font | phrase | special | form | webobject
    ;

text
    :   PCDATA | text_tag
    ;
/*end*/

/*BLOCK ELEMENTS*/

h1  :   OH1^ (block | text)* CH1!
    ;

h2  :   OH2^ (block | text)* CH2!
    ;

h3  :   OH3^ (block | text)* CH3!
    ;

h4  :   OH4^ (block | text)* CH4!
    ;

h5  :   OH5^ (block | text)* CH5!
    ;

h6  :   OH6^ (block | text)* CH6!
    ;

// FIX: now annotated like every other element rule (opening tag is the
// subtree root, closing tag is dropped).
address
    :   OADDRESS^ (PCDATA)? CADDRESS!
    ;

//NOTE: according to the standard, paragraphs can't contain block elements
//like HR.  Netscape may insert these elements into paragraphs.
//We adhere strictly here.

paragraph
    :   OPARA^
        (
            /*  Rule body_content may also be just plain text because HTML is
                so loose.  When body puts body_content in a loop, ANTLR
                doesn't know whether you want it to match all the text as part
                of this paragraph (in the case where the </p> is missing) or
                if the body rule should scarf it.  This is analogous to the
                dangling-else clause.  I shut off the warning.
            */
            options {
                generateAmbigWarnings=false;
            }
        :   text
        )*
        (CPARA)?!
    ;

// FIX: dir and menu were defined but referenced from nowhere, so <dir> and
// <menu> could never be parsed.  They are list containers like ul/ol.
list
    :   unordered_list
    |   ordered_list
    |   def_list
    |   dir
    |   menu
    ;

unordered_list
    :   OULIST^ (PCDATA)? (list_item)+ CULIST!
    ;

ordered_list
    :   OOLIST^ (PCDATA)? (list_item)+ COLIST!
    ;

def_list
    :   ODLIST^ (PCDATA)? (def_list_item)+ CDLIST!
    ;

list_item
    :   OLITEM^ ( text | list )+ (CLITEM! (PCDATA)?)?
    ;

def_list_item
    :   dt | dd
    ;

dt  :   ODTERM^ (text)+ CDTERM! (PCDATA)?
    ;

// FIX: a definition is closed by </dd> (CDDEF).  It used to require CDTERM,
// i.e. </dt>, so every well-formed <dd>...</dd> was rejected.
dd  :   ODDEF^ (text | block)+ CDDEF! (PCDATA)?
    ;

// The optional PCDATA lets text between the opening tag and the first
// <li> through, exactly as unordered_list and ordered_list do.
dir :   ODIR^ (PCDATA)? (list_item)+ CDIR!
    ;

menu
    :   OMENU^ (PCDATA)? (list_item)+ CMENU!
    ;

preformatted
    :   OPRE^ (text)+ CPRE!
    ;

div :   ODIV^ (body_content)* CDIV!     //semi-revised
    ;

center
    :   OCENTER^ (body_content)* CCENTER! //semi-revised
    ;

// <webobject ...> ... </webobject>, allowed to wrap any body content.
webobject
    :   OWEBOBJECT^ (body_content)* CWEBOBJECT!
    ;

blockquote
    :   OBQUOTE^ PCDATA CBQUOTE!
    ;

form
    :   OFORM^ (form_field | body_content)* CFORM!
    ;

table
    :   OTABLE^ (caption)? (PCDATA)? (tr)+ CTABLE!
    ;

caption
    :   OCAP^ (text)* CCAP!
    ;

tr  :   O_TR^ (PCDATA)? (th_or_td)* (C_TR! (PCDATA)?)?
    ;

th_or_td
    :   O_TH_OR_TD^ (body_content)* (C_TH_OR_TD! (PCDATA)?)?
    ;

/*TEXT ELEMENTS*/

/*font style*/

teletype
    :   OTTYPE^ ( text )+ CTTYPE!
    ;

italic
    :   OITALIC^ ( text )+ CITALIC!
    ;

bold
    :   OBOLD^ ( text )+ CBOLD!
    ;

underline
    :   OUNDER^ ( text )+ CUNDER!
    ;

strike
    :   OSTRIKE^ ( text )+ CSTRIKE!
    ;

big :   OBIG^ ( text )+ CBIG!
    ;

small
    :   OSMALL^ ( text )+ CSMALL!
    ;

subscript
    :   OSUB^ ( text )+ CSUB!
    ;

superscript
    :   OSUP^ ( text )+ CSUP!
    ;

/*phrase elements*/

emphasize
    :   OEM^ ( text )+ CEM!
    ;

strong
    :   OSTRONG^ ( text )+ CSTRONG!
    ;

// FIX: the lexer names the <dfn> / </dfn> tokens ODFN and CDFN.  This rule
// used ODEF / CDEF, which nothing produces, so <dfn> never parsed.
definition
    :   ODFN^ ( text )+ CDFN!
    ;

code
    :   OCODE^ ( text )+ CCODE!
    ;

sample_output
    :   OSAMP^ ( text )+ CSAMP!
    ;

keyboard_text
    :   OKBD^ ( text )+ CKBD!
    ;

variable
    :   OVAR^ ( text )+ CVAR!
    ;

citation
    :   OCITE^ ( text )+ CCITE!
    ;

/* form fields (combined with body_content elsewhere so no PCDATA on end) */
form_field
    :   INPUT | select | textarea
    ;

select
    :   OSELECT^ (PCDATA)? (select_option)+ CSELECT!
    ;

select_option
    :   SELOPT (PCDATA)?
    ;

textarea
    :   OTAREA^ (PCDATA)? CTAREA!
    ;

/* special text level elements*/
anchor
    :   OANCHOR^ (text)* CANCHOR!
    ;

// NOTE(review): APARAM and CAPPLET must be emitted by the lexer under
// exactly these names for this rule to match - confirm against the lexer.
applet
    :   OAPPLET^ (APARAM)? (PCDATA)? CAPPLET!
    ;

//not w3-no blocks allowed; www.microsoft.com uses
font_dfn
    :   OFONT^ (text)* CFONT!
    ;

map :   OMAP^ (AREA)+ CMAP!
    ;
|
|
|
|
class WOHTMLLexer extends Lexer;
options {
    k = 4;
    tokenVocabulary=WOHTML;
    charVocabulary = '\3'..'\377';
    caseSensitive=false;
    filter=UNDEFINED_TOKEN;
}

// Lexer half of the WOHTML grammar.  Matching is case-insensitive
// (caseSensitive=false), so tag literals are written in lower case only.
// UNDEFINED_TOKEN is installed as the filter rule.
//
// Naming scheme: O<NAME> is an opening tag, C<NAME> the matching closing
// tag.  Tags whose spelling shares a long prefix are left-factored into a
// single rule that picks the real token type with $setType.

/*  STRUCTURAL tags
*/

DOCTYPE
    options {
        ignore=WS;
    }
    :   "<!doctype" "html" "public" STRING '>'
    ;

OHTML
    :   "<html>"
    ;

CHTML
    :   "</html>"
    ;

OHEAD
    :   "<head>"
    ;

CHEAD
    :   "</head>"
    ;

OBODY
    :   "<body" (WS (ATTR )*)? '>'
    ;

CBODY
    :   "</body>"
    ;

/*  HEAD ELEMENTS
*/

OTITLE
    :   "<title>"
    ;

CTITLE
    :   "</title>"
    ;

OSCRIPT
    :   "<script>"
    ;

CSCRIPT
    :   "</script>"
    ;

ISINDEX
    :   "<isindex" WS ATTR '>'
    ;

META
    :   "<meta" WS (ATTR)+ '>'
    ;

LINK
    :   "<link" WS (ATTR)+ '>'
    ;

/* headings */

OH1 :   "<h1" (WS ATTR)? '>'
    ;

CH1 :   "</h1>"
    ;

OH2 :   "<h2" (WS ATTR)?'>'
    ;

CH2 :   "</h2>"
    ;

OH3 :   "<h3" (WS ATTR)? '>'
    ;

CH3 :   "</h3>"
    ;

OH4 :   "<h4" (WS ATTR)? '>'
    ;

CH4 :   "</h4>"
    ;

OH5 :   "<h5" (WS ATTR)? '>'
    ;

CH5 :   "</h5>"
    ;

OH6 :   "<h6" (WS ATTR)? '>'
    ;

CH6 :   "</h6>"
    ;

OADDRESS
    :   "<address>"
    ;

CADDRESS
    :   "</address>"
    ;

OPARA
    :   "<p" (WS ATTR)? '>'
    ;

CPARA
    :   "</p>"      //it's optional
    ;

/*UNORDERED LIST*/
OULIST
    :   "<ul" (WS ATTR)? '>'
    ;

CULIST
    :   "</ul>"
    ;

/*ORDERED LIST*/
OOLIST
    :   "<ol" (WS ATTR)? '>'
    ;

COLIST
    :   "</ol>"
    ;

/*LIST ITEM*/

OLITEM
    :   "<li" (WS ATTR)? '>'
    ;

CLITEM
    :   "</li>"
    ;

/*DEFINITION LIST*/

ODLIST
    :   "<dl" (WS ATTR)? '>'
    ;

CDLIST
    :   "</dl>"
    ;

ODTERM
    :   "<dt>"
    ;

CDTERM
    :   "</dt>"
    ;

ODDEF
    :   "<dd>"
    ;

CDDEF
    :   "</dd>"
    ;

ODIR
    :   "<dir>"
    ;

// </dir> and </div> share the prefix "</di"; the next letter decides
// which token type is reported.
CDIR_OR_CDIV
    :   "</di"
        (   'r' {$setType(WOHTMLTokenType_CDIR);}
        |   'v' {$setType(WOHTMLTokenType_CDIV);}
        )
        '>'
    ;

ODIV
    :   "<div" (WS ATTR)? '>'
    ;

OMENU
    :   "<menu>"
    ;

CMENU
    :   "</menu>"
    ;

// A newline directly after the opening tag is consumed as part of the tag.
OPRE
    :   ("<pre>" | "<xmp>") ('\n')?
    ;

CPRE
    :   "</pre>" | "</xmp>"
    ;

OCENTER
    :   "<center>"
    ;

CCENTER
    :   "</center>"
    ;

OWEBOBJECT
    :   "<webobject" (WS (ATTR)*)? '>'
    ;

CWEBOBJECT
    :   "</webobject>"
    ;

OBQUOTE
    :   "<blockquote>"
    ;

CBQUOTE
    :   "</blockquote>"
    ;

//this is block element and thus can't be nested inside of
//other block elements, ex: paragraphs.
//Netscape appears to generate bad HTML vis-a-vis the standard.

HR  :   "<hr" (WS (ATTR)*)? '>'
    ;

OTABLE
    :   "<table" (WS (ATTR)*)? '>'
    ;

CTABLE
    :   "</table>"
    ;

OCAP
    :   "<caption" (WS (ATTR)*)? '>'
    ;

CCAP
    :   "</caption>"
    ;

O_TR
    :   "<tr" (WS (ATTR)*)? '>'
    ;

C_TR
    :   "</tr>"
    ;

O_TH_OR_TD
    :   ("<th" | "<td") (WS (ATTR)*)? '>'
    ;

C_TH_OR_TD
    :   "</th>" | "</td>"
    ;

/*  PCDATA-LEVEL ELEMENTS
*/

/* font style elements*/

OTTYPE
    :   "<tt>"
    ;

CTTYPE
    :   "</tt>"
    ;

OITALIC
    :   "<i>"
    ;

CITALIC
    :   "</i>"
    ;

OBOLD
    :   "<b>"
    ;

CBOLD
    :   "</b>"
    ;

OUNDER
    :   "<u>"
    ;

CUNDER
    :   "</u>"
    ;

/** Left-factor <strike> and <strong> to reduce lookahead */
OSTRIKE_OR_OSTRONG
    :   "<str"
        (   "ike" {$setType(WOHTMLTokenType_OSTRIKE);}
        |   "ong" {$setType(WOHTMLTokenType_OSTRONG);}
        )
        '>'
    ;

// Closing tags starting with "</st": </strike>, </strong> and </style>.
CST_LEFT_FACTORED
    :   "</st"
        (   "rike" {$setType(WOHTMLTokenType_CSTRIKE);}
        |   "rong" {$setType(WOHTMLTokenType_CSTRONG);}
        |   "yle"  {$setType(WOHTMLTokenType_CSTYLE);}
        )
        '>'
    ;

OSTYLE
    :   "<style>"
    ;

OBIG
    :   "<big>"
    ;

CBIG
    :   "</big>"
    ;

OSMALL
    :   "<small>"
    ;

CSMALL
    :   "</small>"
    ;

OSUB
    :   "<sub>"
    ;

OSUP
    :   "<sup>"
    ;

CSUB_OR_CSUP
    :   "</su"
        (   'b' {$setType(WOHTMLTokenType_CSUB);}
        |   'p' {$setType(WOHTMLTokenType_CSUP);}
        )
        '>'
    ;

/* phrase elements*/
OEM :   "<em>"
    ;

CEM :   "</em>"
    ;

ODFN
    :   "<dfn>"
    ;

CDFN
    :   "</dfn>"
    ;

OCODE
    :   "<code>"
    ;

CCODE
    :   "</code>"
    ;

OSAMP
    :   "<samp>"
    ;

CSAMP
    :   "</samp>"
    ;

OKBD
    :   "<kbd>"
    ;

CKBD
    :   "</kbd>"
    ;

OVAR
    :   "<var>"
    ;

CVAR
    :   "</var>"
    ;

OCITE
    :   "<cite>"
    ;

// FIX: was misspelled CCYTE.  The parser's 'citation' rule closes with
// CCITE, so </cite> was never delivered under the name the parser expects.
CCITE
    :   "</cite>"
    ;

/* form fields*/
INPUT
    :   "<input" (WS (ATTR)*)? '>'
    ;

OSELECT
    :   "<select" (WS (ATTR)*)? '>'
    ;

CSELECT
    :   "</select>"
    ;

OTAREA
    :   "<textarea" (WS (ATTR)*)? '>'
    ;

CTAREA
    :   "</textarea>"
    ;

SELOPT
    :   "<option" (WS (ATTR)*)? '>'
    ;

/* special text level elements*/

OANCHOR
    :   "<a" WS (ATTR)+ '>'
    ;

CANCHOR
    :   "</a>"
    ;

IMG :   "<img" WS (ATTR)+ '>'
    ;

OAPPLET
    :   "<applet" WS (ATTR)+ '>'
    ;

// FIX: was named APPLET.  The parser's 'applet' rule closes with CAPPLET,
// which also matches the O.../C... naming used by every other tag pair.
CAPPLET
    :   "</applet>"
    ;

// FIX: was misspelled APARM.  The parser's 'applet' rule references APARAM.
APARAM
    :   "<param" WS (ATTR)+'>'
    ;

OFORM
    :   "<form" WS (ATTR)+ '>'
    ;

OFONT
    :   "<font" WS (ATTR)+ '>'
    ;

// </form> and </font> left-factored on the shared prefix "</fo".
CFORM_OR_CFONT
    :   "</fo"
        (   "rm" {$setType(WOHTMLTokenType_CFORM);}
        |   "nt" {$setType(WOHTMLTokenType_CFONT);}
        )
        '>'
    ;

/*
CFORM
    :   "</form>"
    ;

CFONT
    :   "</font>"
    ;
*/

// <basefont ...> and <base ...> left-factored on the shared prefix "<base".
BFONT_OR_BASE
    :   "<base"
        (   "font" WS ATTR {$setType(WOHTMLTokenType_BFONT);}
        |   WS ATTR        {$setType(WOHTMLTokenType_BASE);}
        )
        '>'
    ;

/*
BFONT
    :   "<basefont" WS ATTR '>'
    ;

BASE:   "<base" WS ATTR '>'
    ;
*/

BR
    :   "<br" (WS ATTR)? '>'
    ;

OMAP
    :   "<map" WS ATTR '>'
    ;

CMAP
    :   "</map>"
    ;

AREA
    :   "<area" WS (ATTR)+ '>'
    ;

/*MISC STUFF*/

// Text between tags.  Each newline flavour bumps the line counter.  The
// characters '<', '>' and '"' are not part of PCDATA.
PCDATA
    :   (
            /* See comment in WS.  Language for combining any flavor
             * newline is ambiguous.  Shutting off the warning.
             */
            options {
                generateAmbigWarnings=false;
            }
        :   '\r' '\n'       {[self newline];}
        |   '\r'            {[self newline];}
        |   '\n'            {[self newline];}
        |   ~('<'|'\n'|'\r'|'"'|'>')
        )+
    ;

protected
COMMENT_DATA
    :   (~('<' | '!' | '>'))+
    ;

// Comments are matched and then skipped; they never reach the parser.
COMMENT
    :   "<!--" (COMMENT_DATA)? "-->" { _ttype = ANTLRToken_SKIP; }
    ;

/*
    PROTECTED LEXER RULES
*/

protected
WS  :   (
            /*  '\r' '\n' can be matched in one alternative or by matching
                '\r' in one iteration and '\n' in another.  I am trying to
                handle any flavor of newline that comes in, but the language
                that allows both "\r\n" and "\r" and "\n" to all be valid
                newline is ambiguous.  Consequently, the resulting grammar
                must be ambiguous.  I'm shutting this warning off.
            */
            options {
                generateAmbigWarnings=false;
            }
        :   ' '
        |   '\t'
        |   '\n'    { [self newline]; }
        |   "\r\n"  { [self newline]; }
        |   '\r'    { [self newline]; }
        )+
    ;

// One attribute: a name, optionally followed by '=' and a value that is a
// word (optionally with a trailing '%'), a possibly negative integer, a
// quoted string or a '#'-prefixed hex number.
protected
ATTR
    options {
        ignore=WS;
    }
    :   WORD ('=' (WORD ('%')? | ('-')? INT | STRING | HEXNUM))?
    ;

//don't need uppercase for case-insen.
//the '.' is for words like "image.gif"
protected
WORD
    :   (   LCLETTER
        |   '.'
        )

        (
            /*  In reality, a WORD must be followed by whitespace, '=', or
                what can follow an ATTR such as '>'.  In writing this grammar,
                however, we just list all the possibilities as optional
                elements.  This is loose, allowing the case where nothing is
                matched after a WORD and then the (ATTR)* loop means the
                grammar would allow "widthheight" as WORD WORD or WORD, hence,
                an ambiguity.  Naturally, ANTLR will consume the input as soon
                as possible, combining "widthheight" into one WORD.

                I am shutting off the ambiguity here because ANTLR does the
                right thing.  The exit path is ambiguous with every
                alternative.  The only solution would be to write an unnatural
                grammar (lots of extra productions) that laid out the
                possibilities explicitly, preventing the bogus WORD followed
                immediately by WORD without whitespace etc...
            */
            options {
                generateAmbigWarnings=false;
            }
        :   LCLETTER
        |   DIGIT
        |   '.'
        )+
    ;

protected
STRING
    :   '"' (~'"')* '"'
    |   '\'' (~'\'')* '\''
    ;

protected
WSCHARS
    :   ' ' | '\t' | '\n' | '\r'
    ;

protected
SPECIAL
    :   '<' | '~'
    ;

protected
HEXNUM
    :   '#' HEXINT
    ;

protected
INT :   (DIGIT)+
    ;

protected
HEXINT
    :   (
            /*  Technically, HEXINT cannot be followed by a..f, but due to our
                loose grammar, the whitespace that normally would follow this
                rule is optional.  ANTLR reports that #4FACE could parse as
                HEXINT "#4" followed by WORD "FACE", which is clearly bogus.
                ANTLR does the right thing by consuming as much input as
                possible here.  I shut the warning off.
            */
            options {
                generateAmbigWarnings=false;
            }
        :   HEXDIGIT
        )+
    ;

protected
DIGIT
    :   '0'..'9'
    ;

protected
HEXDIGIT
    :   '0'..'9'
    |   'a'..'f'
    ;

protected
LCLETTER
    :   'a'..'z'
    ;

// Named by the filter option above.  An unrecognised tag, plus any
// newlines right after it, is logged; a lone newline only bumps the line
// counter; any other single character is matched with no action.
protected
UNDEFINED_TOKEN
    :   '<' (~'>')* '>'
        (
            (   /* the usual newline hassle: \r\n can be matched in alt 1
                 * or by matching alt 2 followed by alt 3 in another iteration.
                 */
                options {
                    generateAmbigWarnings=false;
                }
            :   "\r\n" | '\r' | '\n'
            )
            { [self newline];}
        )*
        {NSLog(@"invalid tag: %@",[self text]);}
    |   ( "\r\n" | '\r' | '\n' ) {[self newline];}
    |   .
    ;

/*
    :   ('<' { NSLog(@"Warning: non-standard tag <%c",(char)[self LA:1]); } )
        (~'>' { NSLog(@"%c",(char)[self LA:1]);} )*
        ('>' { NSLog(@" skipped."); } )
        { _ttype = ANTLRToken_SKIP; }
    ;
*/
|
|
|