gmqcc/lex.c

/*
 * Copyright (C) 2012 
 * 	Dale Weiler
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is furnished to do
 * so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <stdio.h>
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include "gmqcc.h"

/*
 * Reserved words recognised by the lexer.  lex_token() returns the
 * index of the matching entry, and lex_debug() switches on the
 * indices 0..8, so the order of this table must not change.
 */
static const char *const lex_keywords[] = {
	/* control flow */
	"do",       /*  0 */
	"else",     /*  1 */
	"if",       /*  2 */
	"while",    /*  3 */
	"break",    /*  4 */
	"continue", /*  5 */
	"return",   /*  6 */
	"goto",     /*  7 */
	"for",      /*  8 */

	/* types */
	"int",      /*  9 */
	"void",     /* 10 */
	"string",   /* 11 */
	"float",    /* 12 */
	"vector",   /* 13 */
	"entity"    /* 14 */
};

/*
 * Create a lexer state for an already opened stream.
 *
 * The stream is seeked to its end to record its size in both
 * `length` (running counter) and `size` (kept for lex_reset), then
 * rewound.  Every field the other lexer routines read is initialised
 * here, including `current` and `lastok`, so the state is usable
 * without a prior call to lex_reset().
 *
 * Returns the new state, or NULL when `fp` is NULL or the allocation
 * fails.  The returned state takes over `fp`: lex_close() closes it.
 */
struct lex_file *lex_open(FILE *fp) {
	struct lex_file *lex;

	/* fseek/ftell on a NULL stream is undefined; refuse it up front */
	if (!fp)
		return NULL;

	lex = mem_a(sizeof(struct lex_file));
	if (!lex)
		return NULL;

	lex->file = fp;
	fseek(lex->file, 0, SEEK_END);
	lex->length = ftell(lex->file);
	lex->size   = lex->length; /* copy, this is never changed */
	fseek(lex->file, 0, SEEK_SET);

	lex->last    = 0;
	lex->current = 0;

	memset(lex->peek,   0, sizeof(lex->peek  ));
	memset(lex->lastok, 0, sizeof(lex->lastok));

	return lex;
}

/*
 * Close the stream held by a lexer state and release the state.
 * Returns the result of fclose(), or -1 when `file` is NULL.
 */
int lex_close(struct lex_file *file) {
	int status;

	if (!file)
		return -1;

	status = fclose(file->file);
	mem_d(file);
	return status;
}

/*
 * Append one character to the current token text (lastok).
 * Characters past the capacity are dropped; once the last usable slot
 * has been filled the buffer is NUL terminated.
 */
static void lex_addch(int ch, struct lex_file *file) {
	const size_t limit = sizeof(file->lastok) - 1;

	if (file->current < limit)
		file->lastok[file->current++] = (char)ch;

	if (file->current == limit)
		file->lastok[limit] = (char)'\0';
}
/*
 * Start a new token: rewind the write position of lastok.  The buffer
 * contents are not wiped; the next lex_addch() calls overwrite them.
 */
static inline void lex_clear(struct lex_file *file) {
	file->current = 0;
}

/*
 * Read (inget) a character from, or push one back (unget) onto, a
 * lexer stream.  Push-back does not go through the file stream; the
 * lexer keeps its own internal state for this.
 */
/*
 * Fetch the next raw character: the most recently pushed-back one if
 * any are pending, otherwise the next character of the file.  The
 * remaining-length counter is decremented on every call.
 */
static int lex_inget(struct lex_file *file) {
	file->length--;

	if (file->last <= 0)
		return fgetc(file->file);

	file->last--;
	return file->peek[file->last];
}
/*
 * Push a character back so the next lex_inget() returns it.  When the
 * push-back buffer is already full the character is dropped; the
 * remaining-length counter is incremented either way.
 */
static void lex_unget(int ch, struct lex_file *file) {
	file->length++;

	if (file->last >= sizeof(file->peek))
		return;

	file->peek[file->last] = ch;
	file->last++;
}

/*
 * This is trigraph and digraph support, a feature no qc compiler
 * supports.  Moving up in this world!
 */
/*
 * Translate a trigraph.  Called once a single '?' has already been
 * consumed; looks at the following characters and returns either the
 * replacement character for "??x" or a plain '?' with everything that
 * was read ahead pushed back.
 */
static int lex_trigraph(struct lex_file *file) {
	int  ch;
	if ((ch = lex_inget(file)) != '?') {
		lex_unget(ch, file);
		return '?';
	}
	
	ch = lex_inget(file);
	switch (ch) {
		case '(' : return '[' ;
		case ')' : return ']' ;
		case '/' : return '\\';
		case '\'': return '^' ;
		case '<' : return '{' ;
		case '>' : return '}' ;
		case '!' : return '|' ;
		case '-' : return '~' ;
		case '=' : return '#' ;
		default  : break;
	}
	
	/*
	 * Not a trigraph: the input "??x" has to come out as '?', '?', x.
	 * lex_inget() pops the push-back buffer last-in-first-out, so x is
	 * pushed first and the second '?' last; the first '?' is returned.
	 */
	lex_unget(ch , file);
	lex_unget('?', file);
	return '?';
}
/*
 * Translate a digraph.  `first` is the character already consumed
 * ('<', '%' or ':'); the next character is read and, when the pair
 * forms a digraph, the replacement is returned.  Otherwise the second
 * character is pushed back and `first` is returned unchanged.
 */
static int lex_digraph(struct lex_file *file, int first) {
	int second = lex_inget(file);

	if (first == '<') {
		if (second == '%') return '{';
		if (second == ':') return '[';
	} else if (first == '%') {
		if (second == '>') return '}';
		if (second == ':') return '#';
	} else if (first == ':') {
		if (second == '>') return ']';
	}

	/* no digraph: give the look-ahead character back */
	lex_unget(second, file);
	return first;
}

/*
 * Fetch the next character with trigraphs and digraphs already
 * translated.
 */
static int lex_getch(struct lex_file *file) {
	int ch = lex_inget(file);

	switch (ch) {
		case '?':
			return lex_trigraph(file);
		case '<':
		case ':':
		case '%':
			return lex_digraph(file, ch);
		default:
			return ch;
	}
}

/*
 * Fetch the next character, collapsing whitespace: a run of blanks
 * that ends in a newline yields '\n', any other run of blanks yields
 * a single ' ' and leaves the following character pushed back.
 */
static int lex_get(struct lex_file *file) {
	int ch = lex_getch(file);

	if (!isspace(ch))
		return ch;

	/* swallow blanks up to, but not past, the first newline */
	for (;;) {
		if (!isspace(ch) || ch == '\n')
			break;
		ch = lex_getch(file);
	}

	if (ch != '\n') {
		lex_unget(ch, file);
		return ' ';
	}
	return ch;
}

/*
 * Lex a character literal.  The opening quote has already been
 * consumed by the caller; the literal text, including both quotes, is
 * stored in lastok.
 *
 * At most two body items are accepted (a backslash and the character
 * after it count as one item).  Returns LEX_CHRLIT on success and
 * ERROR_LEX when a newline or end of file is hit, or when the closing
 * quote does not follow the accepted body.
 */
static int lex_skipchr(struct lex_file *file) {
	int ch = 0;
	int it;
	
	lex_clear(file);
	lex_addch('\'', file);
	
	for (it = 0; it < 2 && ((ch = lex_inget(file)) != '\''); it++) {
		lex_addch(ch, file);
		
		if (ch == '\n' || ch == EOF)
			return ERROR_LEX;
		if (ch == '\\')
			lex_addch(lex_getch(file), file);
	}
	
	/*
	 * If the loop ended on the item limit rather than on a quote, the
	 * closing quote has not been read yet: it must be the very next
	 * character, otherwise the literal is too long or unterminated.
	 */
	if (ch != '\'' && lex_inget(file) != '\'')
		return ERROR_LEX;
	
	lex_addch('\'', file);
	lex_addch('\0', file);
	
	return LEX_CHRLIT;
}

/*
 * Lex a string literal.  The opening quote has already been consumed
 * by the caller; the literal text, including both quotes, is stored
 * in lastok.  A backslash takes the following raw character with it.
 * Returns LEX_STRLIT, or ERROR_LEX when a newline or end of file is
 * reached before the closing quote.
 */
static int lex_skipstr(struct lex_file *file) {
	int ch;

	lex_clear(file);
	lex_addch('"', file);

	for (;;) {
		ch = lex_getch(file);
		if (ch == '"')
			break;

		if (ch == '\n' || ch == EOF)
			return ERROR_LEX;

		lex_addch(ch, file);
		if (ch == '\\')
			lex_addch(lex_inget(file), file);
	}

	lex_addch('"', file);
	lex_addch('\0', file);

	return LEX_STRLIT;
}
/*
 * Lex what follows a '/' that the caller has already consumed.
 *
 *   "//..."  line comment, runs to the newline (consumed, not stored)
 *            or to end of file; a backslash takes the next character
 *            with it, so backslash-newline continues the comment.
 *   block    comment introduced by slash-star; runs to the first
 *            star-slash pair.
 *   other    not a comment: the character is pushed back and '/' is
 *            returned as an ordinary token.
 *
 * The comment text is stored in lastok.  Returns LEX_COMMENT, '/', or
 * the result of error() when a block comment is not closed before end
 * of file.
 */
static int lex_skipcmt(struct lex_file *file) {
	int ch;
	int prev;
	
	lex_clear(file);
	ch = lex_getch(file);
	
	if (ch == '/') {
		lex_addch('/', file);
		lex_addch('/', file);
		
		/* stop on EOF as well: the last line may lack a newline */
		while ((ch = lex_getch(file)) != '\n' && ch != EOF) {
			lex_addch(ch, file);
			if (ch == '\\') {
				ch = lex_getch(file);
				if (ch == EOF)
					break;
				lex_addch(ch, file);
			}
		}
		lex_addch('\0', file);
		return LEX_COMMENT;
	}
	
	if (ch != '*') {
		lex_unget(ch, file);
		return '/';
	}
	
	lex_addch('/', file);
	lex_addch('*', file);
	
	/*
	 * Scan for the terminator by remembering the previous character.
	 * `prev` starts out empty so the opening star cannot pair with an
	 * immediately following slash, and a run of stars before the
	 * slash still closes the comment.
	 */
	prev = 0;
	for (;;) {
		ch = lex_getch(file);
		if (ch == EOF)
			return error(ERROR_LEX, "malformatted comment", " ");
		
		lex_addch(ch, file);
		if (prev == '*' && ch == '/')
			break;
		prev = ch;
	}
	
	lex_addch('\0', file);
	
	return LEX_COMMENT;
}

/*
 * Fetch the next source item: character literals, string literals and
 * comments are lexed as a whole and reported through the return value
 * of their lex_skip* routine; any other character is returned as is.
 */
static int lex_getsource(struct lex_file *file) {
	int ch = lex_get(file);

	if (ch == '\'')
		return lex_skipchr(file);
	if (ch == '"')
		return lex_skipstr(file);
	if (ch == '/')
		return lex_skipcmt(file);

	return ch;
}

/*
 * Return the next token.
 *
 * An identifier is a run of letters and underscores; its text is
 * stored in lastok.  If the text is one of lex_keywords the index of
 * that keyword is returned, otherwise LEX_IDENT.  Anything else is
 * passed through unchanged from lex_getsource().
 *
 * NOTE(review): keyword indices share the return value space with
 * plain character codes (e.g. index 10 and '\n'); callers presumably
 * have to cope with that -- confirm against the parser.
 */
int lex_token(struct lex_file *file) {
	int    ch = lex_getsource(file);
	size_t it;
	
	/* valid identifier */
	if (ch > 0 && (ch == '_' || isalpha(ch))) {
		lex_clear(file);
		while (ch > 0 && (isalpha(ch) || ch == '_')) {
			lex_addch(ch, file);
			/*
			 * Read plain characters here.  Going through
			 * lex_getsource() would start lexing a literal or a
			 * comment in the middle of the identifier, and those
			 * routines call lex_clear(), wiping the text collected
			 * so far.
			 */
			ch = lex_getch(file);
		}
		lex_unget(ch,   file);
		lex_addch('\0', file);
		
		/*
		 * look inside the table for a keyword; the whole string has to
		 * match, a common prefix is not enough
		 */
		for (it = 0; it < sizeof(lex_keywords)/sizeof(*lex_keywords); it++)
			if (!strcmp(file->lastok, lex_keywords[it]))
				return (int)it;
				
		return LEX_IDENT;
	}
	return ch;
}

/*
 * Rewind a lexer state to the beginning of its file: the stream is
 * seeked to offset 0, the remaining-length counter is restored from
 * the saved size, and the push-back and token buffers are emptied.
 */
void lex_reset(struct lex_file *file) {
	/* buffers first */
	memset(file->lastok, 0, sizeof(file->lastok));
	memset(file->peek,   0, sizeof(file->peek));
	file->last    = 0;
	file->current = 0;

	/* then the stream position and its bookkeeping */
	fseek(file->file, 0, SEEK_SET);
	file->length = file->size;
}

/*
 * Debug dump of a file's token stream to stdout.
 *
 * First pass: echoes every token whose value is a printable ASCII
 * code (33..126) and counts the keyword tokens 0..8.  The counts are
 * then printed grouped as branches, loops and statements.  Second
 * pass (after lex_reset): prints the text of every LEX_IDENT token.
 * The lexer is reset again before returning.  Always returns 1.
 */
int lex_debug(struct lex_file *file) {
	/* counter slots, in the same order as the first nine keywords */
	enum {
		KW_DO, KW_ELSE, KW_IF, KW_WHILE, KW_BREAK,
		KW_CONTINUE, KW_RETURN, KW_GOTO, KW_FOR,
		KW_COUNT
	};
	int seen[KW_COUNT] = {0};
	int token          = 0;

	printf("===========================\nTOKENS:   \n===========================\n");
	for (;;) {
		token = lex_token(file);
		if (token == ERROR_LEX || file->length < 0)
			break;

		if (token >= 0 && token < KW_COUNT)
			seen[token]++;

		if (token >= 33 && token <= 126)
			putchar(token);
	}

	printf("\n===========================\nBRANCHES \n===========================\n");
	printf("\t if       % 8d\n", seen[KW_IF]);
	printf("\t else     % 8d\n", seen[KW_ELSE]);
	printf("===========================\nLOOPS      \n===========================\n");
	printf("\t for      % 8d\n", seen[KW_FOR]);
	printf("\t while    % 8d\n", seen[KW_WHILE]);
	printf("\t do       % 8d\n", seen[KW_DO]);
	printf("===========================\nSTATEMENTS \n===========================\n");
	printf("\t break    % 8d\n", seen[KW_BREAK]);
	printf("\t continue % 8d\n", seen[KW_CONTINUE]);
	printf("\t return   % 8d\n", seen[KW_RETURN]);
	printf("\t goto     % 8d\n", seen[KW_GOTO]);
	printf("===========================\nIDENTIFIERS\n===========================\n");

	/* second pass over the same input, this time for identifiers */
	lex_reset(file);
	for (;;) {
		token = lex_token(file);
		if (token == ERROR_LEX || file->length < 0)
			break;

		if (token == LEX_IDENT)
			printf("%s ", file->lastok);
	}
	fputc('\n', stdout);

	lex_reset(file);
	return 1;
}
initial commit 2012-04-09 10:42:06 +00:00			`/*`
			`* Copyright (C) 2012`
			`* Dale Weiler`
			`*`
			`* Permission is hereby granted, free of charge, to any person obtaining a copy of`
			`* this software and associated documentation files (the "Software"), to deal in`
			`* the Software without restriction, including without limitation the rights to`
			`* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies`
			`* of the Software, and to permit persons to whom the Software is furnished to do`
			`* so, subject to the following conditions:`
			`*`
			`* The above copyright notice and this permission notice shall be included in all`
			`* copies or substantial portions of the Software.`
			`*`
			`* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE`
			`* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER`
			`* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,`
			`* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE`
			`* SOFTWARE.`
			`*/`
			`#include <stdio.h>`
			`#include <limits.h>`
			`#include <stdlib.h>`
			`#include <ctype.h>`
			`#include <string.h>`
			`#include "gmqcc.h"`

More parsing & parse tree 2012-04-09 23:00:13 +00:00			`/*`
			`* Keywords are multichar, punctuation lexing is a bit more complicated`
			`* than keyword lexing.`
			`*/`
initial commit 2012-04-09 10:42:06 +00:00			`static const char *const lex_keywords[] = {`
			`"do", "else", "if", "while",`
			`"break", "continue", "return", "goto",`
Some new types, and lexer changes 2012-04-09 11:45:20 +00:00			`"for",`

			`/* types */`
			`"int",`
			`"void",`
			`"string",`
			`"float",`
			`"vector",`
Work in progress preprocessor 2012-04-09 13:36:16 +00:00			`"entity",`
initial commit 2012-04-09 10:42:06 +00:00			`};`

Work in progress preprocessor 2012-04-09 13:36:16 +00:00			`struct lex_file lex_open(FILE fp) {`
initial commit 2012-04-09 10:42:06 +00:00			`struct lex_file *lex = mem_a(sizeof(struct lex_file));`
			`if (lex) {`
Work in progress preprocessor 2012-04-09 13:36:16 +00:00			`lex->file = fp;`
initial commit 2012-04-09 10:42:06 +00:00			`fseek(lex->file, 0, SEEK_END);`
			`lex->length = ftell(lex->file);`
			`lex->size = lex->length; /* copy, this is never changed */`
			`fseek(lex->file, 0, SEEK_SET);`
			`lex->last = 0;`

			`memset(lex->peek, 0, sizeof(lex->peek));`
			`}`
			`return lex;`
			`}`

			`int lex_close(struct lex_file *file) {`
			`int ret = -1;`
			`if (file) {`
			`ret = fclose(file->file);`
			`mem_d(file);`
			`}`
			`return ret;`
			`}`

			`static void lex_addch(int ch, struct lex_file *file) {`
			`if (file->current < sizeof(file->lastok)-1)`
			`file->lastok[file->current++] = (char)ch;`
			`if (file->current == sizeof(file->lastok)-1)`
			`file->lastok[file->current] = (char)'\0';`
			`}`
			`static inline void lex_clear(struct lex_file *file) {`
			`file->current = 0;`
			`}`

			`/*`
			`* read in inget/unget character from a lexer stream.`
			`* This doesn't play with file streams, the lexer has`
			`* it's own internal state for this.`
			`*/`
			`static int lex_inget(struct lex_file *file) {`
			`file->length --;`
			`if (file->last > 0)`
			`return file->peek[--file->last];`
			`return fgetc(file->file);`
			`}`
			`static void lex_unget(int ch, struct lex_file *file) {`
			`if (file->last < sizeof(file->peek))`
			`file->peek[file->last++] = ch;`
			`file->length ++;`
			`}`

			`/*`
			`* This is trigraph and digraph support, a feature not qc compiler`
			`* supports. Moving up in this world!`
			`*/`
			`static int lex_trigraph(struct lex_file *file) {`
			`int ch;`
			`if ((ch = lex_inget(file)) != '?') {`
			`lex_unget(ch, file);`
			`return '?';`
			`}`

			`ch = lex_inget(file);`
			`switch (ch) {`
			`case '(' : return '[' ;`
			`case ')' : return ']' ;`
			`case '/' : return '\\';`
			`case '\'': return '^' ;`
			`case '<' : return '{' ;`
			`case '>' : return '}' ;`
			`case '!' : return '\|' ;`
			`case '-' : return '~' ;`
			`case '=' : return '#' ;`
			`default:`
			`lex_unget('?', file);`
			`lex_unget(ch , file);`
			`return '?';`
			`}`
			`return '?';`
			`}`
			`static int lex_digraph(struct lex_file *file, int first) {`
			`int ch = lex_inget(file);`
			`switch (first) {`
			`case '<':`
			`if (ch == '%') return '{';`
			`if (ch == ':') return '[';`
			`break;`
			`case '%':`
			`if (ch == '>') return '}';`
			`if (ch == ':') return '#';`
			`break;`
			`case ':':`
			`if (ch == '>') return ']';`
			`break;`
			`}`

			`lex_unget(ch, file);`
			`return first;`
			`}`

			`static int lex_getch(struct lex_file *file) {`
			`int ch = lex_inget(file);`
			`if (ch == '?')`
			`return lex_trigraph(file);`
			`if (ch == '<' \|\| ch == ':' \|\| ch == '%')`
			`return lex_digraph (file, ch);`

			`return ch;`
			`}`

			`static int lex_get(struct lex_file *file) {`
			`int ch;`
			`if (!isspace(ch = lex_getch(file)))`
			`return ch;`

			`/* skip over all spaces */`
			`while (isspace(ch) && ch != '\n')`
			`ch = lex_getch(file);`

			`if (ch == '\n')`
			`return ch;`

			`lex_unget(ch, file);`
			`return ' ';`
			`}`

			`static int lex_skipchr(struct lex_file *file) {`
			`int ch;`
			`int it;`

			`lex_clear(file);`
			`lex_addch('\'', file);`

			`for (it = 0; it < 2 && ((ch = lex_inget(file)) != '\''); it++) {`
			`lex_addch(ch, file);`

			`if (ch == '\n')`
			`return ERROR_LEX;`
			`if (ch == '\\')`
			`lex_addch(lex_getch(file), file);`
			`}`
			`lex_addch('\'', file);`
			`lex_addch('\0', file);`

			`if (it > 2)`
			`return ERROR_LEX;`

			`return LEX_CHRLIT;`
			`}`

			`static int lex_skipstr(struct lex_file *file) {`
			`int ch;`
			`lex_clear(file);`
			`lex_addch('"', file);`

			`while ((ch = lex_getch(file)) != '"') {`
			`if (ch == '\n' \|\| ch == EOF)`
			`return ERROR_LEX;`

			`lex_addch(ch, file);`
			`if (ch == '\\')`
			`lex_addch(lex_inget(file), file);`
			`}`

			`lex_addch('"', file);`
			`lex_addch('\0', file);`

			`return LEX_STRLIT;`
			`}`
			`static int lex_skipcmt(struct lex_file *file) {`
			`int ch;`
			`lex_clear(file);`
			`ch = lex_getch(file);`

			`if (ch == '/') {`
			`lex_addch('/', file);`
			`lex_addch('/', file);`

			`while ((ch = lex_getch(file)) != '\n') {`
			`if (ch == '\\') {`
			`lex_addch(ch, file);`
			`lex_addch(lex_getch(file), file);`
			`} else {`
			`lex_addch(ch, file);`
			`}`
			`}`
			`lex_addch('\0', file);`
			`return LEX_COMMENT;`
			`}`

			`if (ch != '*') {`
			`lex_unget(ch, file);`
			`return '/';`
			`}`

			`lex_addch('/', file);`

			`/* hate this */`
			`do {`
			`lex_addch(ch, file);`
			`while ((ch = lex_getch(file)) != '*') {`
			`if (ch == EOF)`
Work in progress preprocessor 2012-04-09 13:36:16 +00:00			`return error(ERROR_LEX, "malformatted comment", " ");`
initial commit 2012-04-09 10:42:06 +00:00			`else`
			`lex_addch(ch, file);`
			`}`
			`lex_addch(ch, file);`
			`} while ((ch = lex_getch(file)) != '/');`

			`lex_addch('/', file);`
			`lex_addch('\0', file);`

			`return LEX_COMMENT;`
			`}`

			`static int lex_getsource(struct lex_file *file) {`
			`int ch = lex_get(file);`

			`/* skip char/string/comment */`
			`switch (ch) {`
			`case '\'': return lex_skipchr(file);`
			`case '"': return lex_skipstr(file);`
			`case '/': return lex_skipcmt(file);`
			`default: return ch;`
			`}`
			`}`

			`int lex_token(struct lex_file *file) {`
			`int ch = lex_getsource(file);`
			`int it;`

			`/* valid identifier */`
			`if (ch > 0 && (ch == '_' \|\| isalpha(ch))) {`
			`lex_clear(file);`
Work in progress preprocessor 2012-04-09 13:36:16 +00:00			`while (ch > 0 && (isalpha(ch) \|\| ch == '_')) {`
initial commit 2012-04-09 10:42:06 +00:00			`lex_addch(ch, file);`
			`ch = lex_getsource(file);`
			`}`
			`lex_unget(ch, file);`
			`lex_addch('\0', file);`

			`/* look inside the table for a keyword .. */`
			`for (it = 0; it < sizeof(lex_keywords)/sizeof(*lex_keywords); it++)`
			`if (!strncmp(file->lastok, lex_keywords[it], sizeof(lex_keywords[it])))`
			`return it;`

			`return LEX_IDENT;`
			`}`
			`return ch;`
			`}`

			`void lex_reset(struct lex_file *file) {`
			`file->current = 0;`
			`file->last = 0;`
			`file->length = file->size;`
			`fseek(file->file, 0, SEEK_SET);`

			`memset(file->peek, 0, sizeof(file->peek ));`
			`memset(file->lastok, 0, sizeof(file->lastok));`
			`}`

			`int lex_debug(struct lex_file *file) {`
			`int list_do = 0;`
			`int list_else = 0;`
			`int list_if = 0;`
			`int list_while = 0;`
			`int list_break = 0;`
			`int list_continue = 0;`
			`int list_return = 0;`
			`int list_goto = 0;`
			`int list_for = 0;`
			`int token = 0;`
			`printf("===========================\nTOKENS: \n===========================\n");`
			`while ((token = lex_token(file)) != ERROR_LEX && file->length >= 0) {`
			`if (token != -1) {`
			`switch (token) {`
			`case 0: list_do ++; break;`
			`case 1: list_else ++; break;`
			`case 2: list_if ++; break;`
			`case 3: list_while ++; break;`
			`case 4: list_break ++; break;`
			`case 5: list_continue++; break;`
			`case 6: list_return ++; break;`
			`case 7: list_goto ++; break;`
			`case 8: list_for ++; break;`
			`}`
			`}`
			`if (token >= 33 && token <= 126)`
			`putchar(token);`
			`}`
			`printf("\n===========================\nBRANCHES \n===========================\n");`
			`printf("\t if % 8d\n", list_if);`
			`printf("\t else % 8d\n", list_else);`
			`printf("===========================\nLOOPS \n===========================\n");`
			`printf("\t for % 8d\n", list_for);`
			`printf("\t while % 8d\n", list_while);`
			`printf("\t do % 8d\n", list_do);`
			`printf("===========================\nSTATEMENTS \n===========================\n");`
			`printf("\t break % 8d\n", list_break);`
			`printf("\t continue % 8d\n", list_continue);`
			`printf("\t return % 8d\n", list_return);`
			`printf("\t goto % 8d\n", list_goto);`
			`printf("===========================\nIDENTIFIERS\n===========================\n");`
			`lex_reset(file);`
			`while ((token = lex_token(file)) != ERROR_LEX && file->length >= 0)`
			`if (token == LEX_IDENT)`
			`printf("%s ", file->lastok);`
Some parsing 2012-04-09 14:14:26 +00:00			`fputc('\n', stdout);`
initial commit 2012-04-09 10:42:06 +00:00			`lex_reset(file);`
			`return 1;`
			`}`