/*
 *  A push-model scanner example for re2c -f
 *  Written Mon Apr 11 2005 by mgix@mgix.com
 *  This file is in the public domain.
 *
 */

// ----------------------------------------------------------------------

#include <fcntl.h>
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>

#if defined(WIN32)

    typedef signed char     int8_t;
    typedef signed short    int16_t;
    typedef signed int      int32_t;

    typedef unsigned char   uint8_t;
    typedef unsigned short  uint16_t;
    typedef unsigned int    uint32_t;

#else

    #include <stdint.h>
    #include <unistd.h>

    #ifndef O_BINARY
        #define O_BINARY 0
    #endif

#endif

// ----------------------------------------------------------------------
// X-macro listing every token kind. It is expanded once to build the
// Token enum and once to build the parallel tokenNames[] string table,
// so the two always stay in sync.
#define TOKENS              \
                            \
    TOK(kEOF)               \
    TOK(kEOL)               \
    TOK(kUnknown)           \
    TOK(kIdentifier)        \
    TOK(kDecimalConstant)   \
                            \
    TOK(kEqual)             \
    TOK(kLeftParen)         \
    TOK(kRightParen)        \
    TOK(kMinus)             \
    TOK(kPlus)              \
    TOK(kStar)              \
    TOK(kSlash)             \
                            \
    TOK(kIf)                \
    TOK(kFor)               \
    TOK(kElse)              \
    TOK(kGoto)              \
    TOK(kBreak)             \
    TOK(kWhile)             \
    TOK(kReturn)

// ----------------------------------------------------------------------
// Printable name of each token, indexed by PushScanner::Token value.
static const char *tokenNames[] =
{
    #define TOK(x) #x,
        TOKENS
    #undef TOK
};

// ----------------------------------------------------------------------
// Push-model scanner: the caller feeds input in batches through push();
// the scanner tokenizes as much as it can, reports each token through
// send(), and saves its state when it runs out of data so that the next
// push() resumes exactly where it stopped.
class PushScanner
{
public:

    enum Token
    {
        #define TOK(x) x,
            TOKENS
        #undef TOK
    };

private:

    bool        eof;        // set once the data source has signaled end of input
    int32_t     state;      // saved re2c fill state (-1 = fresh start)

    uint8_t     *limit;     // one past the last valid byte in buffer
    uint8_t     *start;     // first byte of the token currently being scanned
    uint8_t     *cursor;    // next byte the scanner will look at
    uint8_t     *marker;    // re2c backtracking position
    uint8_t     *buffer;    // heap storage holding unconsumed input
    uint8_t     *bufferEnd; // one past the end of the allocation

    uint8_t     yych;       // scanner variables kept as members so that
    uint32_t    yyaccept;   // they persist across push() calls

    // The scanner owns a heap buffer that is released in the destructor,
    // so copying is disabled (declared, never defined).
    PushScanner(const PushScanner &);
    PushScanner &operator=(const PushScanner &);

public:

    // ----------------------------------------------------------------------
    // Start with an empty buffer and no saved state.
    PushScanner()
    {
        limit = 0;
        start = 0;
        state = -1;
        cursor = 0;
        marker = 0;
        buffer = 0;
        eof = false;
        bufferEnd = 0;
    }

    // ----------------------------------------------------------------------
    // Release the input buffer grown by push().
    ~PushScanner()
    {
        free(buffer);
    }

    // ----------------------------------------------------------------------
    // Report one token on stdout. The token text is the byte range
    // [start, cursor); it is not printed for kEOF.
    void send(
        Token token
    )
    {
        size_t tokenSize = cursor-start;
        const char *tokenName = tokenNames[token];
        printf(
            "scanner is pushing out a token of type %d (%s)",
            token,
            tokenName
        );

        if(token==kEOF) putchar('\n');
        else
        {
            // Pad the token name to 20 columns so the token text lines up.
            size_t tokenNameSize = strlen(tokenNames[token]);
            size_t padSize = 20-(20<tokenNameSize ? 20 : tokenNameSize);
            for(size_t i=0; i<padSize; ++i) putchar(' ');
            printf(" : ---->");

            fwrite(
                start,
                tokenSize,
                1,
                stdout
            );

            printf("<----\n");
        }
    }

    // ----------------------------------------------------------------------
    // Feed one batch of input to the scanner.
    //
    //   input     : bytes to append (may be 0 when signaling end of input)
    //   inputSize : number of bytes at input; any value smaller than
    //               maxFill (including 0 and negative values) also signals
    //               end of input
    //
    // Returns 1 once the end-of-input token has been sent, 0 when the
    // scanner has consumed what it could and needs another batch.
    // Exits the process if the buffer cannot be grown.
    uint32_t push(
        const void  *input,
        ssize_t     inputSize
    )
    {
        // ssize_t is wider than int on LP64 targets: print it as a long.
        printf(
            "scanner is receiving a new data batch of length %ld\n"
            "scanner continues with saved state = %d\n",
            (long)inputSize,
            (int)state
        );

        /*
         * Data source is signaling end of file when batch size
         * is less than maxFill. This is slightly annoying because
         * maxFill is a value that can only be known after re2c does
         * its thing. Practically though, maxFill is never bigger than
         * the longest keyword, so given our grammar, 32 is a safe bet.
         *
         * A short final batch may still carry real data: those bytes
         * are appended first, then followed by NUL padding so that the
         * scanner can always look ahead far enough to reach the eof rule.
         */
        const ssize_t maxFill = 32;
        const size_t nullPadSize = 64;

        size_t dataSize = (input!=0 && 0<inputSize) ? (size_t)inputSize : 0;
        size_t padSize = 0;
        if(input==0 || inputSize<maxFill)
        {
            eof = true;
            padSize = nullPadSize;
        }

        /*
         * When we get here, we have a partially
         * consumed buffer which is in the following state:
         *                                                                last valid char        last valid buffer spot
         *                                                                v                      v
         * +-------------------+-------------+---------------+-------------+----------------------+
         * ^                   ^             ^               ^             ^                      ^
         * buffer              start         marker          cursor        limit                  bufferEnd
         *
         * We need to stretch the buffer and concatenate the new chunk of input to it
         *
         */
        size_t used = limit-buffer;
        size_t needed = used+dataSize+padSize;
        size_t allocated = bufferEnd-buffer;
        if(allocated<needed)
        {
            // Remember every position as an offset: realloc may move the block.
            ptrdiff_t limitOffset = limit-buffer;
            ptrdiff_t startOffset = start-buffer;
            ptrdiff_t markerOffset = marker-buffer;
            ptrdiff_t cursorOffset = cursor-buffer;

            // Keep the old pointer until realloc is known to have succeeded.
            uint8_t *grown = (uint8_t*)realloc(buffer, needed);
            if(grown==0)
            {
                fprintf(
                    stderr,
                    "scanner could not allocate %lu bytes\n",
                    (unsigned long)needed
                );
                exit(1);
            }

            buffer = grown;
            bufferEnd = needed+buffer;

            marker = markerOffset + buffer;
            cursor = cursorOffset + buffer;
            start = buffer + startOffset;
            limit = limitOffset + buffer;
        }

        if(0<dataSize)
        {
            memcpy(limit, input, dataSize);
            limit += dataSize;
        }
        if(0<padSize)
        {
            memset(limit, 0, padSize);
            limit += padSize;
        }

        // The scanner starts here
        #define YYLIMIT         limit
        #define YYCURSOR        cursor
        #define YYMARKER        marker
        #define YYCTYPE         uint8_t

        #define SKIP(x)         { start = cursor; goto yy0; }
        #define SEND(x)         { send(x); SKIP();          }
        #define YYFILL(n)       { goto fill;                }

        #define YYGETSTATE()    state
        #define YYSETSTATE(x)   { state = (x);  }

    start:

        /*!re2c
        re2c:startlabel = 1;

        eol = "\n";
        eof = "\000";
        digit = [0-9];
        integer = digit+;
        alpha = [A-Za-z_];
        any = [\000-\377];
        space = [ \h\t\v\f\r];

        "if"                    { SEND(kIf);             }
        "for"                   { SEND(kFor);            }
        "else"                  { SEND(kElse);           }
        "goto"                  { SEND(kGoto);           }
        "break"                 { SEND(kBreak);          }
        "while"                 { SEND(kWhile);          }
        "return"                { SEND(kReturn);         }
        alpha (alpha|digit)*    { SEND(kIdentifier);     }
        integer                 { SEND(kDecimalConstant);}

        "="                     { SEND(kEqual);          }
        "("                     { SEND(kLeftParen);      }
        ")"                     { SEND(kRightParen);     }
        "-"                     { SEND(kMinus);          }
        "+"                     { SEND(kPlus);           }
        "*"                     { SEND(kStar);           }
        "/"                     { SEND(kSlash);          }

        eol                     { SKIP();                }
        space                   { SKIP();                }
        eof                     { send(kEOF); return 1;  }
        any                     { SEND(kUnknown);        }
        */

    fill:
        ssize_t unfinishedSize = cursor-start;
        printf(
            "scanner needs a refill. Exiting for now with:\n"
            "    saved fill state = %d\n"
            "    unfinished token size = %ld\n",
            (int)state,
            (long)unfinishedSize
        );

        if(0<unfinishedSize && start<limit)
        {
            printf("    unfinished token is :");
            fwrite(start, 1, cursor-start, stdout);
            putchar('\n');
        }
        putchar('\n');

        /*
         * Once we get here, we can get rid of
         * everything before start and after limit.
         */

        // No more data will ever arrive: resume scanning on the padding.
        if(eof==true) goto start;

        // Slide the unconsumed tail down to the front of the buffer.
        if(buffer<start)
        {
            size_t startOffset = start-buffer;
            memmove(buffer, start, limit-start);
            marker -= startOffset;
            cursor -= startOffset;
            limit -= startOffset;
            start -= startOffset;
        }
        return 0;
    }
};

// ----------------------------------------------------------------------
// Read up to size bytes from fd into dst, retrying after short reads
// (pipes and terminals may return fewer bytes than requested before the
// end of input). Returns the number of bytes stored, which is less than
// size only at end of input, or -1 if read() fails.
static ssize_t readBatch(
    int     fd,
    uint8_t *dst,
    size_t  size
)
{
    size_t total = 0;
    while(total<size)
    {
        ssize_t n = read(fd, dst+total, size-total);
        if(n<0) return -1;
        if(n==0) break;
        total += (size_t)n;
    }
    return (ssize_t)total;
}

// ----------------------------------------------------------------------
// Tokenize the file named by argv[1] (or stdin when no argument is
// given) and print every token. Returns 0 on success, 1 on read failure;
// exits with 1 if the file cannot be opened.
int main(
    int     argc,
    char    **argv
)
{
    // Parse cmd line
    int input = 0;
    if(1<argc)
    {
        input = open(argv[1], O_RDONLY | O_BINARY);
        if(input<0)
        {
            fprintf(
                stderr,
                "could not open file %s\n",
                argv[1]
            );
            exit(1);
        }
    }

    /*
     * Tokenize input file by pushing batches
     * of data one by one into the scanner.
     */
    const size_t batchSize = 256;
    uint8_t buffer[batchSize];
    PushScanner scanner;

    int status = 0;
    uint32_t done = 0;
    while(1)
    {
        ssize_t n = readBatch(input, buffer, batchSize);
        if(n<0)
        {
            fprintf(stderr, "could not read input\n");
            status = 1;
            break;
        }

        done = scanner.push(buffer, n);

        // A partial batch means the input is exhausted.
        if(done!=0 || (size_t)n<batchSize) break;
    }

    // Signal end of input unless the scanner has already finished
    // or the input could not be read.
    if(status==0 && done==0) scanner.push(0, -1);
    close(input);

    // Done
    return status;
}