2006-02-24 04:48:15 +00:00
|
|
|
.\"
|
2006-06-20 20:30:39 +00:00
|
|
|
.\" $Id: re2c.1.in 523 2006-05-25 13:32:09Z helly $
|
2006-02-24 04:48:15 +00:00
|
|
|
.\"
|
2006-06-20 20:30:39 +00:00
|
|
|
.TH RE2C 1 "22 April 2005" "Version 0.10.5"
|
2006-02-24 04:48:15 +00:00
|
|
|
.ds re \fBre2c\fP
|
|
|
|
.ds le \fBlex\fP
|
|
|
|
.ds rx regular expression
|
|
|
|
.ds lx \fIl\fP-expression
|
|
|
|
.SH NAME
|
|
|
|
re2c \- convert regular expressions to C/C++
|
|
|
|
|
|
|
|
.SH SYNOPSIS
|
2006-05-25 04:32:20 +00:00
|
|
|
\*(re [\fB-bdefghisvVw1\fP] [\fB-o output\fP] file
|
2006-02-24 04:48:15 +00:00
|
|
|
|
|
|
|
.SH DESCRIPTION
|
|
|
|
\*(re is a preprocessor that generates C-based recognizers from regular
|
|
|
|
expressions.
|
|
|
|
The input to \*(re consists of C/C++ source interleaved with
|
|
|
|
comments of the form \fC/*!re2c\fP ... \fC*/\fP which contain
|
|
|
|
scanner specifications.
|
|
|
|
In the output these comments are replaced with code that, when
|
|
|
|
executed, will find the next input token and then execute
|
|
|
|
some user-supplied token-specific code.
|
|
|
|
|
|
|
|
For example, given the following code
|
|
|
|
|
|
|
|
.in +3
|
|
|
|
.nf
|
|
|
|
#define NULL ((char*) 0)
|
2006-05-25 04:32:20 +00:00
|
|
|
char *scan(char *p)
|
|
|
|
{
|
2006-02-24 04:48:15 +00:00
|
|
|
#define YYCTYPE char
|
|
|
|
#define YYCURSOR p
|
|
|
|
#define YYLIMIT p
|
|
|
|
#define YYFILL(n)
|
|
|
|
/*!re2c
|
|
|
|
[0-9]+ {return YYCURSOR;}
|
|
|
|
[\\000-\\377] {return NULL;}
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
.fi
|
|
|
|
.in -3
|
|
|
|
|
|
|
|
\*(re will generate
|
|
|
|
|
|
|
|
.in +3
|
|
|
|
.nf
|
|
|
|
/* Generated by re2c on Sat Apr 16 11:40:58 1994 */
|
|
|
|
#line 1 "simple.re"
|
|
|
|
#define NULL ((char*) 0)
|
2006-05-25 04:32:20 +00:00
|
|
|
char *scan(char *p)
|
|
|
|
{
|
2006-02-24 04:48:15 +00:00
|
|
|
#define YYCTYPE char
|
|
|
|
#define YYCURSOR p
|
|
|
|
#define YYLIMIT p
|
|
|
|
#define YYFILL(n)
|
|
|
|
{
|
|
|
|
YYCTYPE yych;
|
|
|
|
unsigned int yyaccept;
|
|
|
|
goto yy0;
|
|
|
|
yy1: ++YYCURSOR;
|
|
|
|
yy0:
|
|
|
|
if((YYLIMIT - YYCURSOR) < 2) YYFILL(2);
|
|
|
|
yych = *YYCURSOR;
|
|
|
|
if(yych <= '/') goto yy4;
|
|
|
|
if(yych >= ':') goto yy4;
|
|
|
|
yy2: yych = *++YYCURSOR;
|
|
|
|
goto yy7;
|
|
|
|
yy3:
|
2006-05-25 04:32:20 +00:00
|
|
|
#line 9
|
2006-02-24 04:48:15 +00:00
|
|
|
{return YYCURSOR;}
|
|
|
|
yy4: yych = *++YYCURSOR;
|
|
|
|
yy5:
|
2006-05-25 04:32:20 +00:00
|
|
|
#line 10
|
2006-02-24 04:48:15 +00:00
|
|
|
{return NULL;}
|
|
|
|
yy6: ++YYCURSOR;
|
|
|
|
if(YYLIMIT == YYCURSOR) YYFILL(1);
|
|
|
|
yych = *YYCURSOR;
|
|
|
|
yy7: if(yych <= '/') goto yy3;
|
|
|
|
if(yych <= '9') goto yy6;
|
|
|
|
goto yy3;
|
|
|
|
}
|
2006-05-25 04:32:20 +00:00
|
|
|
#line 11
|
2006-02-24 04:48:15 +00:00
|
|
|
|
|
|
|
}
|
|
|
|
.fi
|
|
|
|
.in -3
|
|
|
|
|
2006-05-25 04:32:20 +00:00
|
|
|
You can place one \fC/*!max:re2c */\fP comment that will output a "#define
|
|
|
|
\fCYYMAXFILL\fP <n>" line that holds the maximum number of characters
|
|
|
|
required to parse the input. That is the maximum value \fCYYFILL\fP(n)
|
|
|
|
will receive. If -1 is in effect then YYMAXFILL can only be triggered once
|
|
|
|
after the last \fC/*!re2c */\fP.
|
|
|
|
|
|
|
|
You can also use \fC/*!ignore:re2c */\fP blocks that allow you to document the
|
|
|
|
scanner code and will not be part of the output.
|
|
|
|
|
2006-02-24 04:48:15 +00:00
|
|
|
.SH OPTIONS
|
|
|
|
\*(re provides the following options:
|
|
|
|
.TP
|
2006-05-25 04:32:20 +00:00
|
|
|
\fB-?\fP
|
|
|
|
\fB-h\fP
|
|
|
|
Invoke a short help.
|
2006-02-24 04:48:15 +00:00
|
|
|
.TP
|
|
|
|
\fB-b\fP
|
|
|
|
Implies \fB-s\fP. Use bit vectors as well in the attempt to coax better
|
|
|
|
code out of the compiler. Most useful for specifications with more than a
|
|
|
|
few keywords (e.g. for most programming languages).
|
|
|
|
.TP
|
2006-05-25 04:32:20 +00:00
|
|
|
\fB-d\fP
|
|
|
|
Creates a parser that dumps information about the current position and in
|
|
|
|
which state the parser is while parsing the input. This is useful to debug
|
|
|
|
parser issues and states. If you use this switch you need to define a macro
|
|
|
|
\fIYYDEBUG\fP that is called like a function with two parameters:
|
|
|
|
\fIvoid YYDEBUG(int state, char current)\fP. The first parameter receives the
|
|
|
|
state or -1 and the second parameter receives the input at the current cursor.
|
2006-02-24 04:48:15 +00:00
|
|
|
.TP
|
2006-05-25 04:32:20 +00:00
|
|
|
\fB-e\fP
|
|
|
|
Cross-compile from an ASCII platform to an EBCDIC one.
|
|
|
|
.TP
|
|
|
|
\fB-f\fP
|
|
|
|
Generate a scanner with support for storable state.
|
|
|
|
For details see below at \fBSCANNER WITH STORABLE STATES\fP.
|
2006-02-24 04:48:15 +00:00
|
|
|
.TP
|
2006-05-25 04:32:20 +00:00
|
|
|
\fB-g\fP
|
|
|
|
Generate a scanner that utilizes GCC's computed goto feature. That is re2c
|
|
|
|
generates jump tables whenever a decision is of a certain complexity (e.g. a
|
|
|
|
lot of if conditions are otherwise necessary). This is only useable with GCC
|
|
|
|
and produces output that cannot be compiled with any other compiler. Note that
|
|
|
|
this implies -b and that the complexity threshold can be configured using the
|
|
|
|
inplace configuration "cgoto:threshold".
|
|
|
|
.TP
|
|
|
|
\fB-i\fP
|
|
|
|
Do not output #line information. This is useful when you want to use a CMS tool
|
|
|
|
with the re2c output which you might want if you do not require your users to
|
|
|
|
have re2c themselves when building from your source.
|
2006-02-24 04:48:15 +00:00
|
|
|
.TP
\fB-o output\fP
|
|
|
|
Specify the output file.
|
2006-05-25 04:32:20 +00:00
|
|
|
.TP
|
|
|
|
\fB-s\fP
|
|
|
|
Generate nested \fCif\fPs for some \fCswitch\fPes. Many compilers need this
|
|
|
|
assist to generate better code.
|
|
|
|
.TP
|
|
|
|
\fB-v\fP
|
|
|
|
Show version information.
|
|
|
|
.TP
|
|
|
|
\fB-V\fP
|
|
|
|
Show the version as a number XXYYZZ.
|
|
|
|
.TP
|
|
|
|
\fB-w\fP
|
|
|
|
Create a parser that supports wide chars (UCS-2). This implies \fB-s\fP and
|
|
|
|
cannot be used together with \fB-e\fP switch.
|
|
|
|
.TP
|
|
|
|
\fB-1\fP
|
|
|
|
Force single pass generation, this cannot be combined with -f and disables
|
|
|
|
YYMAXFILL generation prior to last re2c block.
|
2006-02-24 04:48:15 +00:00
|
|
|
.SH "INTERFACE CODE"
|
|
|
|
Unlike other scanner generators, \*(re does not generate complete scanners:
|
|
|
|
the user must supply some interface code.
|
|
|
|
In particular, the user must define the following macros:
|
|
|
|
.TP
|
2006-05-25 04:32:20 +00:00
|
|
|
\fCYYCTYPE\fP
|
2006-02-24 04:48:15 +00:00
|
|
|
Type used to hold an input symbol.
|
|
|
|
Usually \fCchar\fP or \fCunsigned char\fP.
|
|
|
|
.TP
|
|
|
|
\fCYYCURSOR\fP
|
2006-05-25 04:32:20 +00:00
|
|
|
\*(lx of type \fC*YYCTYPE\fP that points to the current input symbol.
|
2006-02-24 04:48:15 +00:00
|
|
|
The generated code advances \fCYYCURSOR\fP as symbols are matched.
|
|
|
|
On entry, \fCYYCURSOR\fP is assumed to point to the first character of the
|
|
|
|
current token. On exit, \fCYYCURSOR\fP will point to the first character of
|
|
|
|
the following token.
|
|
|
|
.TP
|
2006-05-25 04:32:20 +00:00
|
|
|
\fCYYLIMIT\fP
|
|
|
|
Expression of type \fC*YYCTYPE\fP that marks the end of the buffer
|
|
|
|
(\fCYYLIMIT[-1]\fP is the last character in the buffer).
|
|
|
|
The generated code repeatedly compares \fCYYCURSOR\fP to \fCYYLIMIT\fP
|
2006-02-24 04:48:15 +00:00
|
|
|
to determine when the buffer needs (re)filling.
|
|
|
|
.TP
|
|
|
|
\fCYYMARKER\fP
|
2006-05-25 04:32:20 +00:00
|
|
|
\*(lx of type \fC*YYCTYPE\fP.
|
|
|
|
The generated code saves backtracking information in \fCYYMARKER\fP. Some easy
|
|
|
|
scanners might not use this.
|
|
|
|
.TP
|
|
|
|
\fCYYCTXMARKER\fP
|
|
|
|
\*(lx of type \fC*YYCTYPE\fP.
|
|
|
|
The generated code saves trailing context backtracking information in \fCYYCTXMARKER\fP.
|
|
|
|
The user only needs to define this macro if a scanner specification uses trailing
|
|
|
|
context in one or more of its regular expressions.
|
2006-02-24 04:48:15 +00:00
|
|
|
.TP
|
|
|
|
\fCYYFILL(\fP\fIn\fP\fC)\fP
|
2006-05-25 04:32:20 +00:00
|
|
|
The generated code "calls" \fCYYFILL\fP(n) when the buffer needs
|
2006-02-24 04:48:15 +00:00
|
|
|
(re)filling: at least \fIn\fP additional characters should
|
2006-05-25 04:32:20 +00:00
|
|
|
be provided. \fCYYFILL\fP(n) should adjust \fCYYCURSOR\fP, \fCYYLIMIT\fP,
|
|
|
|
\fCYYMARKER\fP and \fCYYCTXMARKER\fP as needed. Note that for typical
|
|
|
|
programming languages \fIn\fP will be the length of the longest keyword plus one.
|
|
|
|
The user can place a comment of the form \fC/*!max:re2c */\fP once to insert
|
|
|
|
a \fCYYMAXFILL\fP definition that is set to the maximum length value. If -1
|
|
|
|
switch is used then YYMAXFILL can be triggered once after the last \fC/*!re2c */\fP
|
|
|
|
block.
|
|
|
|
.TP
|
|
|
|
\fCYYGETSTATE()\fP
|
|
|
|
The user only needs to define this macro if the \fB-f\fP flag was specified.
|
|
|
|
In that case, the generated code "calls" \fCYYGETSTATE\fP at the very beginning
|
|
|
|
of the scanner in order to obtain the saved state. YYGETSTATE must return a signed
|
|
|
|
integer. The value must be either -1, indicating that the scanner is entered for the
|
|
|
|
first time, or a value previously saved by \fCYYSETSTATE\fP. In the second case, the
|
|
|
|
scanner will resume operations right after where the last \fCYYFILL\fP(n) was called.
|
|
|
|
.TP
|
|
|
|
\fCYYSETSTATE(\fP\fIn\fP\fC)\fP
|
|
|
|
The user only needs to define this macro if the \fB-f\fP flag was specified.
|
|
|
|
In that case, the generated code "calls" \fCYYSETSTATE\fP just before calling
|
|
|
|
\fCYYFILL\fP(n). The parameter to \fCYYSETSTATE\fP is a signed integer that uniquely
|
|
|
|
identifies the specific instance of \fCYYFILL\fP(n) that is about to be called.
|
|
|
|
Should the user wish to save the state of the scanner and have \fCYYFILL\fP(n) return
|
|
|
|
to the caller, all he has to do is store that unique identifier in a variable.
|
|
|
|
Later, when the scanner is called again, it will call \fCYYGETSTATE()\fP and
|
|
|
|
resume execution right where it left off.
|
|
|
|
.TP
|
|
|
|
\fCYYDEBUG(\fP\fIstate\fP,\fIcurrent\fC)\fP
|
|
|
|
This is only needed if the \fB-d\fP flag was specified. It allows you to easily debug
|
|
|
|
the generated parser by calling a user defined function for every state. The function
|
|
|
|
should have the following signature: \fIvoid YYDEBUG(int state, char current)\fP.
|
|
|
|
The first parameter receives the state or -1 and the second parameter receives the
|
|
|
|
input at the current cursor.
|
|
|
|
.TP
|
|
|
|
\fCYYMAXFILL\fP
|
|
|
|
This will be automatically defined by \fC/*!max:re2c */\fP blocks as explained above.
|
|
|
|
|
|
|
|
.SH "SCANNER WITH STORABLE STATES"
|
|
|
|
When the \fB-f\fP flag is specified, re2c generates a scanner that
|
|
|
|
can store its current state, return to the caller, and later resume
|
|
|
|
operations exactly where it left off.
|
|
|
|
|
|
|
|
The default operation of re2c is a "pull" model, where the scanner asks
|
|
|
|
for extra input whenever it needs it. However, this mode of operation
|
|
|
|
assumes that the scanner is the "owner" of the parsing loop, and that may
|
|
|
|
not always be convenient.
|
|
|
|
|
|
|
|
Typically, if there is a preprocessor ahead of the scanner in the stream,
|
|
|
|
or for that matter any other procedural source of data, the scanner cannot
|
|
|
|
"ask" for more data unless both scanner and source live in separate threads.
|
|
|
|
|
|
|
|
The \fB-f\fP flag is useful for just this situation: it lets users design
|
|
|
|
scanners that work in a "push" model, i.e. where data is fed to the scanner
|
|
|
|
chunk by chunk. When the scanner runs out of data to consume, it just stores
|
|
|
|
its state, and returns to the caller. When more input data is fed to the scanner,
|
|
|
|
it resumes operations exactly where it left off.
|
|
|
|
|
|
|
|
When using the -f option re2c does not accept stdin because it has to do the
|
|
|
|
full generation process twice which means it has to read the input twice. That
|
|
|
|
means re2c would fail in case it cannot open the input twice or reading the
|
|
|
|
input for the first time influences the second read attempt.
|
|
|
|
|
|
|
|
Changes needed compared to the "pull" model.
|
|
|
|
|
|
|
|
1. User has to supply macros YYSETSTATE(state) and YYGETSTATE()
|
|
|
|
|
|
|
|
2. The \fB-f\fP option inhibits declaration of \fIyych\fP and
|
|
|
|
\fIyyaccept\fP. So the user has to declare these. Also the user has
|
|
|
|
to save and restore these. In the example \fIexamples/push.re\fP these
|
|
|
|
are declared as fields of the (C++) class of which the scanner is a
|
|
|
|
method, so they do not need to be saved/restored explicitly. For C
|
|
|
|
they could e.g. be made macros that select fields from a structure
|
|
|
|
passed in as parameter. Alternatively, they could be declared as local
|
|
|
|
variables, saved with YYFILL(n) when it decides to return and restored
|
|
|
|
at entry to the function. Also, it could be more efficient to save the
|
|
|
|
state from YYFILL(n) because YYSETSTATE(state) is called
|
|
|
|
unconditionally. YYFILL(n) however does not get \fIstate\fP as
|
|
|
|
parameter, so we would have to store state in a local variable by
|
|
|
|
YYSETSTATE(state).
|
|
|
|
|
|
|
|
3. Modify YYFILL(n) to return (from the function calling it) if more
|
|
|
|
input is needed.
|
|
|
|
|
|
|
|
4. Modify caller to recognise "more input is needed" and respond
|
|
|
|
appropriately.
|
|
|
|
|
|
|
|
5. The generated code will contain a switch block that is used to restore
|
|
|
|
the last state by jumping behind the corresponding YYFILL(n) call. This code is
|
|
|
|
automatically generated in the epilog of the first "\fC/*!re2c */\fP" block.
|
|
|
|
It is possible to trigger generation of the YYGETSTATE() block earlier by
|
|
|
|
placing a "\fC/*!getstate:re2c */\fP" comment. This is especially useful when
|
|
|
|
the scanner code should be wrapped inside a loop.
|
|
|
|
|
|
|
|
Please see examples/push.re for a push-model scanner. The generated code can be
|
|
|
|
tweaked using inplace configurations "\fBstate:abort\fP" and "\fBstate:nextlabel\fP".
|
2006-02-24 04:48:15 +00:00
|
|
|
|
|
|
|
.SH "SCANNER SPECIFICATIONS"
|
2006-05-25 04:32:20 +00:00
|
|
|
Each scanner specification consists of a set of \fIrules\fP, \fIname
|
|
|
|
definitions\fP and \fIconfigurations\fP.
|
|
|
|
.LP
|
|
|
|
\fIRules\fP consist of a regular expression along with a block of C/C++ code that
|
|
|
|
is to be executed when the associated \fIregular expression\fP is matched.
|
|
|
|
.P
|
|
|
|
.RS
|
|
|
|
\fIregular expression\fP \fC{\fP \fIC/C++ code\fP \fC}\fP
|
|
|
|
.RE
|
|
|
|
.LP
|
|
|
|
Named definitions are of the form:
|
|
|
|
.P
|
|
|
|
.RS
|
|
|
|
\fIname\fP \fC=\fP \fIregular expression\fP\fC;\fP
|
|
|
|
.RE
|
|
|
|
.LP
|
|
|
|
Configurations look like name definitions whose names start
|
|
|
|
with "\fBre2c:\fP":
|
|
|
|
.P
|
|
|
|
.RS
|
|
|
|
\fCre2c:\fP\fIname\fP \fC=\fP \fIvalue\fP\fC;\fP
|
|
|
|
.RE
|
2006-02-24 04:48:15 +00:00
|
|
|
|
|
|
|
.SH "SUMMARY OF RE2C REGULAR EXPRESSIONS"
|
|
|
|
.TP
|
|
|
|
\fC"foo"\fP
|
|
|
|
the literal string \fCfoo\fP.
|
|
|
|
ANSI-C escape sequences can be used.
|
|
|
|
.TP
|
|
|
|
\fC'foo'\fP
|
|
|
|
the literal string \fCfoo\fP (characters [a-zA-Z] treated case-insensitive).
|
|
|
|
ANSI-C escape sequences can be used.
|
|
|
|
.TP
|
|
|
|
\fC[xyz]\fP
|
|
|
|
a "character class"; in this case,
|
|
|
|
the \*(rx matches either an '\fCx\fP', a '\fCy\fP', or a '\fCz\fP'.
|
|
|
|
.TP
|
|
|
|
\fC[abj-oZ]\fP
|
|
|
|
a "character class" with a range in it;
|
|
|
|
matches an '\fCa\fP', a '\fCb\fP', any letter from '\fCj\fP' through '\fCo\fP',
|
|
|
|
or a '\fCZ\fP'.
|
|
|
|
.TP
|
2006-05-25 04:32:20 +00:00
|
|
|
\fC[^\fIclass\fP\fC]\fP
|
|
|
|
an inverted "character class".
|
|
|
|
.TP
|
2006-02-24 04:48:15 +00:00
|
|
|
\fIr\fP\fC\e\fP\fIs\fP
|
|
|
|
match any \fIr\fP which isn't an \fIs\fP. \fIr\fP and \fIs\fP must be regular expressions
|
|
|
|
which can be expressed as character classes.
|
|
|
|
.TP
|
|
|
|
\fIr\fP\fC*\fP
|
|
|
|
zero or more \fIr\fP's, where \fIr\fP is any regular expression
|
|
|
|
.TP
|
|
|
|
\fC\fIr\fP\fC+\fP
|
|
|
|
one or more \fIr\fP's
|
|
|
|
.TP
|
|
|
|
\fC\fIr\fP\fC?\fP
|
|
|
|
zero or one \fIr\fP's (that is, "an optional \fIr\fP")
|
|
|
|
.TP
|
|
|
|
name
|
|
|
|
the expansion of the "name" definition (see above)
|
|
|
|
.TP
|
|
|
|
\fC(\fP\fIr\fP\fC)\fP
|
|
|
|
an \fIr\fP; parentheses are used to override precedence
|
|
|
|
(see below)
|
|
|
|
.TP
|
|
|
|
\fIrs\fP
|
|
|
|
an \fIr\fP followed by an \fIs\fP ("concatenation")
|
|
|
|
.TP
|
|
|
|
\fIr\fP\fC|\fP\fIs\fP
|
|
|
|
either an \fIr\fP or an \fIs\fP
|
|
|
|
.TP
|
|
|
|
\fIr\fP\fC/\fP\fIs\fP
|
2006-05-25 04:32:20 +00:00
|
|
|
an \fIr\fP but only if it is followed by an \fIs\fP. The \fIs\fP is not part of
|
|
|
|
the matched text. This type of \*(rx is called "trailing context". A trailing
|
|
|
|
context can only be the end of a rule and not part of a named definition.
|
2006-02-24 04:48:15 +00:00
|
|
|
.TP
|
|
|
|
\fIr\fP\fC{\fP\fIn\fP\fC}\fP
|
|
|
|
matches \fIr\fP exactly \fIn\fP times.
|
|
|
|
.TP
|
|
|
|
\fIr\fP\fC{\fP\fIn\fP\fC,}\fP
|
|
|
|
matches \fIr\fP at least \fIn\fP times.
|
|
|
|
.TP
|
|
|
|
\fIr\fP\fC{\fP\fIn\fP\fC,\fP\fIm\fP\fC}\fP
|
|
|
|
matches \fIr\fP at least \fIn\fP but not more than \fIm\fP times.
|
2006-05-25 04:32:20 +00:00
|
|
|
.TP
|
|
|
|
\fC.\fP
|
|
|
|
match any character except newline (\\n).
|
|
|
|
.TP
|
|
|
|
\fIdef\fP
|
|
|
|
matches named definition as specified by \fIdef\fP.
|
|
|
|
.LP
|
|
|
|
Character classes and string literals may contain octal or hexadecimal
|
|
|
|
character definitions and the following set of escape sequences (\fB\\n\fP,
|
|
|
|
\fB\\t\fP, \fB\\v\fP, \fB\\b\fP, \fB\\r\fP, \fB\\f\fP, \fB\\a\fP, \fB\\\\\fP).
|
|
|
|
An octal character is defined by a backslash followed by its three octal digits
|
|
|
|
and a hexadecimal character is defined by backslash, a lower cased '\fBx\fP'
|
|
|
|
and its two hexadecimal digits or a backslash, an upper cased \fBX\fP and its
|
|
|
|
four hexadecimal digits.
|
|
|
|
.LP
|
|
|
|
re2c
|
|
|
|
furthermore supports the C/C++ Unicode notation. That is a backslash followed
|
|
|
|
by either a lowercased \fBu\fP and its four hexadecimal digits or an uppercased
|
|
|
|
\fBU\fP and its eight hexadecimal digits. However using the U notation it is
|
|
|
|
not possible to support characters greater than \fB\\U0000FFFF\fP due to an internal
|
|
|
|
limitation of re2c.
|
|
|
|
.LP
|
|
|
|
Since characters greater than \fB\\X00FF\fP are not allowed in non unicode mode, the
|
|
|
|
only portable "\fBany\fP" rules are \fB(.|"\\n")\fP and \fB[^]\fP.
|
2006-02-24 04:48:15 +00:00
|
|
|
.LP
|
|
|
|
The regular expressions listed above are grouped according to
|
|
|
|
precedence, from highest precedence at the top to lowest at the bottom.
|
|
|
|
Those grouped together have equal precedence.
|
|
|
|
|
2006-05-25 04:32:20 +00:00
|
|
|
.SH "INPLACE CONFIGURATION"
|
2006-02-24 04:48:15 +00:00
|
|
|
.LP
|
2006-05-25 04:32:20 +00:00
|
|
|
It is possible to configure code generation inside re2c blocks. The following
|
|
|
|
lists the available configurations:
|
|
|
|
.TP
|
|
|
|
\fIre2c:indent:top\fP \fB=\fP 0 \fB;\fP
|
|
|
|
Specifies the minimum amount of indentation to use. Requires a numeric value
|
|
|
|
greater than or equal to zero.
|
|
|
|
.TP
|
|
|
|
\fIre2c:indent:string\fP \fB=\fP "\\t" \fB;\fP
|
|
|
|
Specifies the string to use for indentation. Requires a string that should
|
|
|
|
contain only whitespace unless you need this for external tools. The easiest
|
|
|
|
way to specify spaces is to enclose them in single or double quotes. If you do
|
|
|
|
not want any indentation at all you can simply set this to \fB""\fP.
|
|
|
|
.TP
|
|
|
|
\fIre2c:yybm:hex\fP \fB=\fP 0 \fB;\fP
|
|
|
|
If set to zero then a decimal table is being used else a hexadecimal table
|
|
|
|
will be generated.
|
|
|
|
.TP
|
|
|
|
\fIre2c:yyfill:enable\fP \fB=\fP 1 \fB;\fP
|
|
|
|
Set this to zero to suppress generation of YYFILL(n). When using this be sure
|
|
|
|
to verify that the generated scanner does not read behind input. Allowing
|
|
|
|
this behavior might introduce severe security issues to your programs.
|
|
|
|
.TP
|
|
|
|
\fIre2c:startlabel\fP \fB=\fP 0 \fB;\fP
|
|
|
|
If set to a non zero integer then the start label of the next scanner blocks
|
|
|
|
will be generated even if not used by the scanner itself. Otherwise the normal
|
|
|
|
\fByy0\fP like start label is only being generated if needed. If set to a text
|
|
|
|
value then a label with that text will be generated regardless of whether the
|
|
|
|
normal start label is being used or not. This setting is being reset to \fB0\fP
|
|
|
|
after a start label has been generated.
|
|
|
|
.TP
|
|
|
|
\fIre2c:state:abort\fP \fB=\fP 0 \fB;\fP
|
|
|
|
When not zero and switch -f is active then the \fCYYGETSTATE\fP block will
|
|
|
|
contain a default case that aborts and a -1 case is used for initialization.
|
|
|
|
.TP
|
|
|
|
\fIre2c:state:nextlabel\fP \fB=\fP 0 \fB;\fP
|
|
|
|
Used when -f is active to control whether the \fCYYGETSTATE\fP block is
|
|
|
|
followed by a \fCyyNext:\fP label line. Instead of using \fCyyNext\fP you can
|
|
|
|
usually also use configuration \fIstartlabel\fP to force a specific start label
|
|
|
|
or default to \fCyy0\fP as start label. Instead of using a dedicated label it
|
|
|
|
is often better to separate the YYGETSTATE code from the actual scanner code by
|
|
|
|
placing a "\fC/*!getstate:re2c */\fP" comment.
|
|
|
|
.TP
|
|
|
|
\fIre2c:cgoto:threshold\fP \fB=\fP 9 \fB;\fP
|
|
|
|
When -g is active this value specifies the complexity threshold that triggers
|
|
|
|
generation of jump tables rather than using nested if's and decision bitfields.
|
|
|
|
The threshold is compared against a calculated estimation of if-s needed where
|
|
|
|
every used bitmap divides the threshold by 2.
|
2006-02-24 04:48:15 +00:00
|
|
|
|
2006-05-25 04:32:20 +00:00
|
|
|
.SH "UNDERSTANDING RE2C"
|
|
|
|
.LP
|
|
|
|
The subdirectory lessons of the re2c distribution contains a few step by step
|
|
|
|
lessons to get you started with re2c. All examples in the lessons subdirectory
|
|
|
|
can be compiled and actually work.
|
2006-02-24 04:48:15 +00:00
|
|
|
|
|
|
|
.SH FEATURES
|
|
|
|
.LP
|
|
|
|
\*(re does not provide a default action:
|
|
|
|
the generated code assumes that the input
|
|
|
|
will consist of a sequence of tokens.
|
|
|
|
Typically this can be dealt with by adding a rule such as the one for
|
|
|
|
unexpected characters in the example above.
|
|
|
|
.LP
|
|
|
|
The user must arrange for a sentinel token to appear at the end of input
|
|
|
|
(and provide a rule for matching it):
|
|
|
|
\*(re does not provide an \fC<<EOF>>\fP expression.
|
|
|
|
If the source is from a null-byte terminated string, a
|
|
|
|
rule matching a null character will suffice. If the source is from a
|
2006-05-25 04:32:20 +00:00
|
|
|
file then you could pad the input with a newline (or some other character that
|
|
|
|
cannot appear within another token); upon recognizing such a character check
|
|
|
|
to see if it is the sentinel and act accordingly. And you can also use YYFILL(n)
|
|
|
|
to end the scanner in case not enough characters are available which is nothing
|
|
|
|
else than a detection of end of data/file.
|
2006-02-24 04:48:15 +00:00
|
|
|
.LP
|
|
|
|
\*(re does not provide start conditions: use a separate scanner
|
|
|
|
specification for each start condition (as illustrated in the above example).
|
2006-05-25 04:32:20 +00:00
|
|
|
|
2006-02-24 04:48:15 +00:00
|
|
|
.SH BUGS
|
|
|
|
.LP
|
|
|
|
Difference only works for character sets.
|
|
|
|
.LP
|
|
|
|
The \*(re internal algorithms need documentation.
|
|
|
|
|
|
|
|
.SH "SEE ALSO"
|
|
|
|
.LP
|
|
|
|
flex(1), lex(1).
|
|
|
|
.P
|
|
|
|
More information on \fBre2c\fP can be found here:
|
|
|
|
.PD 0
|
|
|
|
.P
|
|
|
|
.B http://sourceforge.net/projects/re2c/
|
|
|
|
.PD 1
|
|
|
|
|
|
|
|
.SH AUTHORS
|
|
|
|
.PD 0
|
|
|
|
.P
|
|
|
|
Peter Bumbulis <peter@csg.uwaterloo.ca>
|
|
|
|
.P
|
|
|
|
Brian Young <bayoung@acm.org>
|
|
|
|
.P
|
|
|
|
Dan Nuffer <nuffer@users.sourceforge.net>
|
|
|
|
.P
|
|
|
|
Marcus Boerger <helly@users.sourceforge.net>
|
|
|
|
.P
|
|
|
|
Hartmut Kaiser <hkaiser@users.sourceforge.net>
|
|
|
|
.P
|
2006-05-25 04:32:20 +00:00
|
|
|
Emmanuel Mogenet <mgix@mgix.com> added storable state
|
|
|
|
.P
|
2006-02-24 04:48:15 +00:00
|
|
|
.PD 1
|
|
|
|
|
|
|
|
.SH VERSION INFORMATION
|
2006-06-20 20:30:39 +00:00
|
|
|
This manpage describes \fBre2c\fP, version 0.10.5.
|
2006-02-24 04:48:15 +00:00
|
|
|
|
|
|
|
.fi
|