libs-gui/TextConverters/RTF/rtfScanner.m

530 lines
13 KiB
Objective-C

/* rtfScanner
Copyright (C) 1999 Free Software Foundation, Inc.
Author: Stefan Boehringer (stefan.boehringer@uni-bochum.de)
Date: Dec 1999
This file is part of the GNUstep GUI Library.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; see the file COPYING.LIB.
If not, see <http://www.gnu.org/licenses/> or write to the
Free Software Foundation, 51 Franklin Street, Fifth Floor,
Boston, MA 02110-1301, USA.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include "rtfScanner.h"
#include "rtfGrammar.tab.h"
// <§> scanner types and helpers
/*
 * Evaluates to the element count of a true array MINUS ONE, i.e. the index
 * of its last element.  Only meaningful for real arrays, not for pointers.
 */
#define CArraySize(a) (sizeof(a)/sizeof((a)[0])-1)

/* Growable character buffer. */
typedef struct {
  char *bf;        /* heap storage holding the characters */
  int length;      /* number of bytes currently allocated for bf */
  int position;    /* index at which the next character is stored */
  int chunkSize;   /* number of bytes added the next time bf must grow */
} DynamicString;

/* One entry of a keyword table: a name and the token value it maps to. */
typedef struct {
  const char *string;   /* keyword text */
  int token;            /* value returned when the keyword matches */
} LexKeyword;
/*
 * Prepares an empty dynamic string backed by a zero-filled buffer of
 * 128 bytes (the initial chunk size).
 *
 * Returns NoError on success or LEXoutOfMemory when the buffer could not
 * be allocated.
 */
GSLexError initDynamicString (DynamicString *string)
{
  string->position = 0;
  string->chunkSize = 128;
  string->length = string->chunkSize;
  string->bf = calloc(1, string->length);

  return string->bf ? NoError : LEXoutOfMemory;
}
/*
 * Appends the character c to the dynamic string, enlarging the buffer by
 * the current chunk size when it is full.  After each successful
 * enlargement the chunk size is doubled.
 *
 * Returns NoError on success.  Returns LEXoutOfMemory when the buffer
 * cannot be enlarged; in that case the string is left exactly as it was
 * (same buffer, same length, same position), so the caller can still
 * release or keep using what has been collected so far.
 */
GSLexError appendChar (DynamicString *string, int c)
{
  if (string->position == string->length)
    {
      char *grown;
      int grownLength;

      // Refuse to grow past what an int can describe instead of
      // overflowing the signed length.
      if (string->chunkSize > INT_MAX - string->length)
        {
          return LEXoutOfMemory;
        }
      grownLength = string->length + string->chunkSize;

      // Keep the old pointer until realloc has succeeded: on failure the
      // original block is still allocated and must not be lost.
      grown = realloc(string->bf, grownLength);
      if (!grown)
        {
          return LEXoutOfMemory;
        }
      string->bf = grown;
      string->length = grownLength;

      // Double the growth step, but never shift into the sign bit.
      if (string->chunkSize <= INT_MAX / 2)
        {
          string->chunkSize <<= 1;
        }
    }
  string->bf[string->position++] = c;
  return NoError;
}
/*
 * Resets a scanner context: stores the character source callback together
 * with the opaque pointer handed to it, empties the pushback buffer and
 * starts counting at stream position 0, line 1.
 */
void lexInitContext (RTFscannerCtxt *lctxt, void *customContext,
                     int (*getcharFunction)(void *))
{
  lctxt->customContext = customContext;
  lctxt->lgetchar = getcharFunction;
  lctxt->pushbackCount = 0;
  lctxt->streamPosition = 0;
  lctxt->streamLineNumber = 1;
}
/*
 * Returns the next character of the input.  Characters that were pushed
 * back are delivered first (most recently pushed first); only when none
 * are pending is the source callback invoked and the stream position
 * advanced.  Every '\n' delivered increments the line counter.
 */
int lexGetchar (RTFscannerCtxt *lctxt)
{
  int ch;

  if (lctxt->pushbackCount == 0)
    {
      lctxt->streamPosition++;
      ch = lctxt->lgetchar(lctxt->customContext);
    }
  else
    {
      ch = lctxt->pushbackBuffer[--lctxt->pushbackCount];
    }

  if (ch == '\n')
    {
      lctxt->streamLineNumber++;
    }
  return ch;
}
/*
 * Pushes c back so that the next lexGetchar() call returns it again.
 * Pushing back a '\n' also undoes the line count increment made when it
 * was read.
 *
 * <!> The pushback buffer is not checked for overflow.
 * NOTE(review): the capacity and element type of pushbackBuffer are
 * declared outside this file; confirm that no caller exceeds the capacity
 * and that every pushed value (including EOF) survives the round trip.
 */
void lexUngetchar (RTFscannerCtxt *lctxt, int c)
{
  lctxt->pushbackBuffer[lctxt->pushbackCount] = c;
  lctxt->pushbackCount++;

  if (c == '\n')
    {
      lctxt->streamLineNumber--;
    }
}
/*
 * Returns the logical read position: the number of characters fetched
 * from the source minus those currently waiting in the pushback buffer.
 */
int lexStreamPosition (RTFscannerCtxt *lctxt)
{
  int fetched = lctxt->streamPosition;
  int pending = lctxt->pushbackCount;

  return fetched - pending;
}
/*
 * Returns a freshly malloc'ed copy of str, terminator included.
 * Returns 0 when str is 0 or when the allocation fails.
 * The caller owns the result and releases it with free().
 */
char *my_strdup (const char *str)
{
  size_t bytes;
  char *copy;

  if (!str)
    {
      return 0;
    }
  bytes = strlen(str) + 1;
  copy = malloc(bytes);
  if (!copy)
    {
      return 0;
    }
  memcpy(copy, str, bytes);
  return copy;
}
/*
 * Binary search for string in a keyword table ordered by strcmp().
 *
 * arrayCount is the HIGHEST VALID INDEX of array (element count minus
 * one), not the element count: the entry at array[arrayCount] is itself
 * compared.
 *
 * Returns the token of the matching entry, or 0 when nothing matches.
 */
int findStringFromKeywordArray(const char *string, const LexKeyword *array,
                               int arrayCount)
{
  int low = 0;
  int high = arrayCount;

  while (low <= high)
    {
      int probe = (low + high) >> 1;
      int order = strcmp(string, array[probe].string);

      if (order == 0)
        {
          return array[probe].token;
        }
      if (order > 0)
        {
          low = probe + 1;
        }
      else
        {
          high = probe - 1;
        }
    }
  return 0; // couldn't find
}
// end <§> scanner types and helpers
// <§> core scanner functions
// Identity macro: expands to its argument unchanged.
#define token(a) (a)
/*
 * Table mapping RTF control word names to parser token values.
 *
 * <!> must be sorted: readCommand() looks names up with
 * findStringFromKeywordArray(), a binary search that compares with
 * strcmp().  Entries therefore have to stay in strcmp() (byte value)
 * order, which places the capitalised "NeXT..." names before all
 * lower-case names.
 */
LexKeyword RTFcommands[] =
{
{"NeXTGraphic",token(RTFNeXTGraphic)},
{"NeXTHelpLink",token(RTFNeXTHelpLink)},
{"NeXTHelpMarker",token(RTFNeXTHelpMarker)},
{"ansi", token(RTFansi)},
{"ansicpg", token(RTFansicpg)},
{"b", token(RTFbold)},
{"blue", token(RTFblue)},
{"bullet", token(RTFbullet)},
{"cb", token(RTFcolorbg)},
{"cell", token(RTFcell)},
{"cf", token(RTFcolorfg)},
{"colortbl", token(RTFcolortable)},
{"cpg", token(RTFcpg)},
{"dn", token(RTFsubscript)},
{"emdash", token(RTFemdash)},
{"emspace", token(RTFemspace)},
{"endash", token(RTFendash)},
{"enspace", token(RTFenspace)},
{"f", token(RTFfont)},
{"fbidi", token(RTFfamilyBiDi)},
{"fcharset", token(RTFfcharset)},
{"fdecor", token(RTFfamilyDecor)},
{"fi", token(RTFfirstLineIndent)},
{"field", token(RTFfield)},
{"filename", token(RTFNeXTfilename)},
{"fldalt", token(RTFfldalt)},
{"flddirty", token(RTFflddirty)},
{"fldedit", token(RTFfldedit)},
{"fldinst", token(RTFfldinst)},
{"fldlock", token(RTFfldlock)},
{"fldpriv", token(RTFfldpriv)},
{"fldrslt", token(RTFfldrslt)},
{"fmodern", token(RTFfamilyModern)},
{"fnil", token(RTFfamilyNil)},
{"fonttbl", token(RTFfontListStart)},
/* All footers are mapped on one entry */
{"footer", token(RTFfooter)},
{"footerf", token(RTFfooter)},
{"footerl", token(RTFfooter)},
{"footerr", token(RTFfooter)},
{"footnote", token(RTFfootnote)},
{"fprq", token(RTFfprq)},
{"froman", token(RTFfamilyRoman)},
{"fs", token(RTFfontSize)},
{"fscript", token(RTFfamilyScript)},
{"fswiss", token(RTFfamilySwiss)},
{"ftech", token(RTFfamilyTech)},
{"fttruetype", token(RTFfttruetype)},
{"green", token(RTFgreen)},
/* All headers are mapped on one entry */
{"header", token(RTFheader)},
{"headerf", token(RTFheader)},
{"headerl", token(RTFheader)},
{"headerr", token(RTFheader)},
{"height", token(RTFNeXTGraphicHeight)},
{"i", token(RTFitalic)},
{"info", token(RTFinfo)},
{"ldblquote", token(RTFldblquote)},
{"li", token(RTFleftIndent)},
{"linkFilename",token(RTFNeXTlinkFilename)},
{"linkMarkername",token(RTFNeXTlinkMarkername)},
{"lquote", token(RTFlquote)},
{"mac", token(RTFmac)},
{"margb", token(RTFmarginButtom)},
{"margl", token(RTFmarginLeft)},
{"margr", token(RTFmarginRight)},
{"margt", token(RTFmarginTop)},
{"markername",token(RTFNeXTmarkername)},
{"paperh", token(RTFpaperHeight)},
{"paperw", token(RTFpaperWidth)},
{"par", token(RTFparagraph)},
{"pard", token(RTFdefaultParagraph)},
{"pc", token(RTFpc)},
{"pca", token(RTFpca)},
{"pict", token(RTFpict)},
{"plain", token(RTFplain)},
{"qc", token(RTFalignCenter)},
{"qj", token(RTFalignJustified)},
{"ql", token(RTFalignLeft)},
{"qr", token(RTFalignRight)},
{"rdblquote", token(RTFrdblquote)},
{"red", token(RTFred)},
{"ri", token(RTFrightIndent)},
{"row", token(RTFrow)},
{"rquote", token(RTFrquote)},
{"rtf", token(RTFstart)},
{"s", token(RTFstyle)},
{"sa", token(RTFspaceAbove)},
{"sl", token(RTFlineSpace)},
{"strike", token(RTFstrikethrough)},
/* NOTE(review): readCommand() collects only alphabetic characters into the
   name it looks up, so a name containing a digit does not look reachable
   through that path -- confirm whether this entry can ever match. */
{"striked1", token(RTFstrikethroughDouble)},
{"stylesheet",token(RTFstylesheet)},
{"tab", token(RTFtabulator)},
{"tx", token(RTFtabstop)},
{"u", token(RTFunichar)},
{"ul", token(RTFunderline)},
{"ulc", token(RTFunderlinecolor)},
{"uld", token(RTFunderlineDot)},
{"uldash", token(RTFunderlineDash)},
{"uldashd", token(RTFunderlineDashDot)},
{"uldashdd", token(RTFunderlineDashDotDot)},
{"uldb", token(RTFunderlineDouble)},
{"ulnone", token(RTFunderlineStop)},
{"ulth", token(RTFunderlineThick)},
{"ulthd", token(RTFunderlineThickDot)},
{"ulthdash", token(RTFunderlineThickDash)},
{"ulthdashd", token(RTFunderlineThickDashDot)},
{"ulthdashdd",token(RTFunderlineThickDashDotDot)},
{"ulw", token(RTFunderlineWord)},
{"up", token(RTFsuperscript)},
{"width", token(RTFNeXTGraphicWidth)}
};
/*
 * Peeks at the next input character without consuming it.
 * Returns YES when that character is alphabetic, i.e. when a control word
 * name follows, and NO otherwise.
 */
BOOL probeCommand (RTFscannerCtxt *lctxt)
{
  int next = lexGetchar(lctxt);

  lexUngetchar(lctxt, next);
  return isalpha(next) ? YES : NO;
}
// <N> According to spec a control word name of up to 32 letters is respected
#define RTFMaxCmdLength 32
#define RTFMaxArgumentLength 64

/*
 * Reads one control word (the leading '\\' has already been consumed):
 * a run of letters, optionally followed by a numeric argument made of an
 * optional leading '-' and decimal digits.  A single space directly after
 * the name or the argument is consumed as delimiter; any other
 * delimiting character is pushed back.
 *
 * On success (NoError):
 *   *token               token of the control word, or RTFOtherStatement
 *                        when the name is not in RTFcommands
 *   lvalp->cmd.name      malloc'ed copy of the name for RTFOtherStatement
 *                        (release is up to the consumer), otherwise 0
 *   lvalp->cmd.isEmpty   YES when no numeric argument was given
 *   lvalp->cmd.parameter the argument clamped to the range of int,
 *                        or 0 when there is none
 *
 * On failure nothing is allocated, *token is left untouched and
 * lvalp->cmd.name is 0:
 *   LEXsyntaxError  name longer than RTFMaxCmdLength letters or argument
 *                   longer than RTFMaxArgumentLength characters
 *   LEXoutOfMemory  the name of an unknown control word could not be copied
 */
GSLexError readCommand (RTFscannerCtxt *lctxt,
                        YYSTYPE *lvalp,
                        int *token) // the '\\' is already read
{
  char cmdNameBf[RTFMaxCmdLength+1], *cmdName = cmdNameBf;
  char argumentBf[RTFMaxArgumentLength+1], *argument = argumentBf;
  char *unknownName = 0;
  int c, foundToken;
  int parameter = 0;
  BOOL isEmpty = YES;

  lvalp->cmd.name = 0; // initialize

  // Collect the name.  The limit is tested before storing so that a name
  // of exactly RTFMaxCmdLength letters is accepted; the buffer has one
  // extra byte for the terminator.
  while (isalpha (c = lexGetchar(lctxt)))
    {
      if (cmdName >= cmdNameBf + RTFMaxCmdLength)
        {
          return LEXsyntaxError;
        }
      *cmdName++ = c;
    }
  *cmdName = 0;

  // The argument is parsed before anything is allocated so that the
  // error exits below cannot leak.
  if (isdigit(c) || c == '-') // we've found a numerical argument
    {
      long value;

      do
        {
          if (argument >= argumentBf + RTFMaxArgumentLength)
            {
              return LEXsyntaxError;
            }
          *argument++ = c;
        } while (isdigit(c = lexGetchar(lctxt)));
      *argument = 0;
      if (c != ' ')
        {
          lexUngetchar(lctxt, c); // <N> ungetc non-digit
        }
      // the consumption of the space seems necessary on NeXT but
      // is not according to spec

      // strtol saturates at the limits of long where atoi would be
      // undefined; narrow the result to int explicitly.
      value = strtol(argumentBf, 0, 10);
      if (value > INT_MAX)
        {
          value = INT_MAX;
        }
      else if (value < INT_MIN)
        {
          value = INT_MIN;
        }
      parameter = (int)value;
      isEmpty = NO;
    }
  else if (c != ' ') // a space is an empty argument and is consumed
    {
      lexUngetchar(lctxt, c); // ungetc non-whitespace delimiter
    }

  foundToken = findStringFromKeywordArray(cmdNameBf, RTFcommands,
                                          CArraySize(RTFcommands));
  if (!foundToken)
    {
      if (!(unknownName = my_strdup(cmdNameBf)))
        {
          return LEXoutOfMemory;
        }
      foundToken = RTFOtherStatement;
    }

  // Publish the results only now that nothing can fail any more.
  lvalp->cmd.name = unknownName;
  lvalp->cmd.isEmpty = isEmpty;
  lvalp->cmd.parameter = parameter;
  *token = foundToken;
  return NoError;
}
/*
 * Collects plain text up to, but not including, the next '{', '}', '\\'
 * or the end of input; the terminating character is pushed back.
 * Unquoted newlines and carriage returns are skipped.
 *
 * On success returns NoError and stores the NUL terminated, heap
 * allocated text in lvalp->text; release is up to the consumer.
 * On failure (buffer could not be allocated or enlarged) the partial
 * buffer is freed, lvalp->text is set to NULL and the error is returned.
 */
GSLexError readText (RTFscannerCtxt *lctxt, YYSTYPE *lvalp)
{
  int c;
  DynamicString text;
  GSLexError error;

  if ((error = initDynamicString(&text)))
    {
      lvalp->text = NULL;
      return error;
    }

  for (;;)
    {
      c = lexGetchar(lctxt);
      if (c == EOF || c == '{' || c == '}' || c == '\\')
        {
          lexUngetchar(lctxt, c);
          break;
        }
      // <N> newline and cr are ignored if not quoted
      if (c == '\n' || c == '\r')
        {
          continue;
        }
      if ((error = appendChar(&text, c)))
        {
          break;
        }
    }

  if (!error)
    {
      error = appendChar(&text, 0);
    }
  if (error)
    {
      // Never hand out an unterminated or stale buffer.
      free(text.bf);
      lvalp->text = NULL;
      return error;
    }

  lvalp->text = text.bf; // release is up to the consumer
  return NoError;
}
/*
 * Reads up to two hexadecimal digits and returns their combined value.
 * Reading stops at the first character that is not a hex digit; that
 * character is pushed back.  With no hex digit at all the result is 0.
 */
static int gethex(RTFscannerCtxt *lctxt)
{
  int value = 0;
  int remaining;

  for (remaining = 2; remaining > 0; remaining--)
    {
      int digit;
      int ch = lexGetchar(lctxt);

      if (!isxdigit(ch))
        {
          lexUngetchar(lctxt, ch);
          return value;
        }

      if (isdigit(ch))
        {
          digit = ch - '0';
        }
      else if (isupper(ch))
        {
          digit = ch - 'A' + 10;
        }
      else
        {
          digit = ch - 'a' + 10;
        }
      value = value * 16 + digit;
    }
  return value;
}
/*
 * Stores the single character c as a heap allocated, NUL terminated
 * string in lvalp->text (release is up to the consumer) and returns
 * RTFtext.  Returns 0 when the string cannot be allocated.
 */
static int GSRTFlexLiteralChar (YYSTYPE *lvalp, int c)
{
  char *cv = calloc(1, 2); // zero-filled, so cv[1] is the terminator

  if (!cv)
    {
      return 0;
    }
  cv[0] = c;
  lvalp->text = cv;
  return RTFtext;
}

/*
 * Scanner entry point: returns the next token and stores its semantic
 * value, if any, in *lvalp.
 *
 *   0                  end of input, or an error while scanning
 *                      (over-long control word, out of memory)
 *   '{' / '}'          group delimiters
 *   RTFtext            lvalp->text holds heap allocated text
 *   RTFparagraph       an escaped newline or carriage return
 *   RTFignore          the "\*" marker
 *   any other value    a control word as delivered by readCommand()
 *
 * Control words that stand for a single character (tab, cell, dashes,
 * bullet, quotes) are replaced by a plain character and returned as the
 * beginning of a text run.
 */
int GSRTFlex (YYSTYPE *lvalp, //YYLTYPE *llocp,
              RTFscannerCtxt *lctxt) /* provide value and position in the params */
{
  int c;
  int token = 0;

  do
    {
      c = lexGetchar(lctxt);
    }
  while ( c == '\n' || c == '\r' ); // <A> the listed characters are to be ignored

  switch (c)
    {
    case EOF:
      {
        token = 0;
        break;
      }
    case '{':
      {
        token = '{';
        break;
      }
    case '}':
      {
        token = '}';
        break;
      }
    case '\\':
      if (probeCommand(lctxt) == YES)
        {
          if (readCommand(lctxt, lvalp, &token) != NoError)
            {
              // Do not return a token whose value was never filled in;
              // drop a name that may already have been allocated.
              free((void *)lvalp->cmd.name);
              lvalp->cmd.name = 0;
              return 0;
            }
          switch (token)
            {
            case RTFtabulator: c = '\t';
              break;
            case RTFcell: c = '\t';
              break;
            case RTFemdash: c = '-';
              break;
            case RTFendash: c = '-';
              break;
            case RTFbullet: c = '*';
              break;
            case RTFlquote: c = '`';
              break;
            case RTFrquote: c = '\'';
              break;
            case RTFldblquote: c = '"';
              break;
            case RTFrdblquote: c = '"';
              break;
            default:
              return token;
            }
        }
      else
        {
          c = lexGetchar(lctxt);
          switch (c)
            {
            case EOF:
              return 0;
            case '\'':
              // Convert the next two hex digits into a char
              c = gethex(lctxt);
              // A hex escaped '{', '}' or '\\' is literal text.  Pushing it
              // back would make it be read again as a group delimiter or
              // as the start of another escape.
              if (c == '{' || c == '}' || c == '\\')
                {
                  return GSRTFlexLiteralChar(lvalp, c);
                }
              break;
            case '*':
              return RTFignore;
            case '|':
            case '-':
            case ':':
              // Ignore these characters
              c = lexGetchar(lctxt);
              break;
            case '_': c = '-';
              break;
            case '~': c = ' ';
              break;
            case '\n':
            case '\r':
              return RTFparagraph;
            case '{':
            case '}':
            case '\\':
              // escaped delimiter: literal text, release is up to the consumer
              return GSRTFlexLiteralChar(lvalp, c);
            default:
              // fall through
              break;
            }
        }
      // else fall through to default: read text <A>
      // no break <A>
    default:
      // c is the first character of a text run: hand it back to the
      // stream and let readText() collect the whole run.
      lexUngetchar(lctxt, c);
      if (readText(lctxt, lvalp) != NoError)
        {
          token = 0;
        }
      else
        {
          token = RTFtext;
        }
      break;
    }

  //*llocp = lctxt->position();
  return token;
}