Copying my old lexer

2025-03-03 07:11:20 +00:00 · 2012-07-16 13:59:10 +02:00 · 2012-07-16 13:59:10 +02:00 · 9b6598f049
commit 9b6598f049
parent 3943e27ebb
6 changed files with 855 additions and 100 deletions
--- a/3
+++ b/3
@ -17,14 +17,13 @@ ifeq ($(CC), clang)

 endif
 OBJ     = \
-          error.o     \
          util.o      \
          code.o      \
          ast.o       \
          ir.o
 OBJ_A = test/ast-test.o
 OBJ_I = test/ir-test.o
-OBJ_C = main.o
+OBJ_C = main.o lexer.o parser.o

 #default is compiler only
 default: gmqcc
--- a/gmqcc.h
+++ b/gmqcc.h
@ -179,95 +179,6 @@ typedef char int64_size_is_correct  [sizeof(int64_t)  == 8?1:-1];
 typedef char uintptr_size_is_correct[sizeof(intptr_t) == sizeof(int*)?1:-1];
 typedef char intptr_size_is_correct [sizeof(uintptr_t)== sizeof(int*)?1:-1];

-/*===================================================================*/
-/*============================ lex.c ================================*/
-/*===================================================================*/
-typedef struct lex_file_t {
-    FILE *file;        /* file handler */
-    char *name;        /* name of file */
-    char  peek  [5];
-    char  lastok[8192];
-
-    int   last;    /* last token                   */
-    int   current; /* current token                */
-    int   length;  /* bytes left to parse          */
-    int   size;    /* never changes (size of file) */
-    int   line;    /* what line are we on?         */
-} lex_file;
-
-/*
- * It's important that this table never exceed 32 keywords, the ascii
- * table starts at 33 (and we don't want conflicts)
- */
-enum {
-    TOKEN_DO       ,
-    TOKEN_ELSE     ,
-    TOKEN_IF       ,
-    TOKEN_WHILE    ,
-    TOKEN_BREAK    ,
-    TOKEN_CONTINUE ,
-    TOKEN_RETURN   ,
-    TOKEN_GOTO     ,
-    TOKEN_FOR      ,   /* extension */
-    TOKEN_TYPEDEF  ,   /* extension */
-
-    /* ensure the token types are out of the  */
-    /* bounds of anyothers that may conflict. */
-    TOKEN_FLOAT    = 110,
-    TOKEN_VECTOR        ,
-    TOKEN_STRING        ,
-    TOKEN_ENTITY        ,
-    TOKEN_VOID
-};
-
-/*
- * Lexer state constants, these are numbers for where exactly in
- * the lexing the lexer is at. Or where it decided to stop if a lexer
- * error occurs.  These numbers must be > where the ascii-table ends
- * and > the last type token which is TOKEN_VOID
- */
-enum {
-    LEX_COMMENT = 1128,
-    LEX_CHRLIT        ,
-    LEX_STRLIT        ,
-    LEX_IDENT
-};
-
-int       lex_token  (lex_file *);
-void      lex_reset  (lex_file *);
-void      lex_close  (lex_file *);
-void      lex_parse  (lex_file *);
-lex_file *lex_include(lex_file *, const char *);
-void      lex_init   (const char *, lex_file **);
-
-/*===================================================================*/
-/*========================== error.c ================================*/
-/*===================================================================*/
-#define ERROR_LEX      (SHRT_MAX+0)
-#define ERROR_PARSE    (SHRT_MAX+1)
-#define ERROR_INTERNAL (SHRT_MAX+2)
-#define ERROR_COMPILER (SHRT_MAX+3)
-#define ERROR_PREPRO   (SHRT_MAX+4)
-int error(lex_file *, int, const char *, ...);
-
-/*===================================================================*/
-/*========================== parse.c ================================*/
-/*===================================================================*/
-int parse_gen(lex_file *);
-
-/*===================================================================*/
-/*========================== typedef.c ==============================*/
-/*===================================================================*/
-typedef struct typedef_node_t {
-    char      *name;
-} typedef_node;
-
-void          typedef_init();
-void          typedef_clear();
-typedef_node *typedef_find(const char *);
-int           typedef_add (lex_file *file, const char *, const char *);
-
-
 /*===================================================================*/
 /*=========================== util.c ================================*/
 /*===================================================================*/
@ -364,7 +275,7 @@ enum {
    TYPE_FIELD    ,
    TYPE_FUNCTION ,
    TYPE_POINTER  ,
-    /* TYPE_INTEGER  , */
+    TYPE_INTEGER  ,
    TYPE_QUATERNION  ,
    TYPE_MATRIX  ,
    TYPE_VARIANT  ,
--- a/lexer.c
+++ b/lexer.c
@ -0,0 +1,632 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include "gmqcc.h"
+#include "lexer.h"
+
+MEM_VEC_FUNCTIONS(token, char, value)
+
+void lexerror(lex_file *lex, const char *fmt, ...)
+{
+	va_list ap;
+
+	if (lex)
+		printf("error %s:%lu: ", lex->name, (unsigned long)lex->sline);
+	else
+		printf("error: ");
+
+	va_start(ap, fmt);
+	vprintf(fmt, ap);
+	va_end(ap);
+
+	printf("\n");
+}
+
+token* token_new()
+{
+	token *tok = (token*)mem_a(sizeof(token));
+	if (!tok)
+		return NULL;
+	memset(tok, 0, sizeof(*tok));
+	return tok;
+}
+
+void token_delete(token *self)
+{
+	if (self->next && self->next->prev == self)
+		self->next->prev = self->prev;
+	if (self->prev && self->prev->next == self)
+		self->prev->next = self->next;
+	MEM_VECTOR_CLEAR(self, value);
+	mem_d(self);
+}
+
+token* token_copy(const token *cp)
+{
+	token* self = token_new();
+	if (!self)
+		return NULL;
+	/* copy the value */
+	self->value_alloc = cp->value_count + 1;
+	self->value_count = cp->value_count;
+	self->value = (char*)mem_a(self->value_alloc);
+	if (!self->value) {
+		mem_d(self);
+		return NULL;
+	}
+	memcpy(self->value, cp->value, cp->value_count);
+	self->value[self->value_alloc-1] = 0;
+
+	/* rest */
+	self->ctx = cp->ctx;
+	self->ttype = cp->ttype;
+	memcpy(&self->constval, &cp->constval, sizeof(self->constval));
+	return self;
+}
+
+void token_delete_all(token *t)
+{
+	token *n;
+
+	do {
+		n = t->next;
+		token_delete(t);
+		t = n;
+	} while(t);
+}
+
+token* token_copy_all(const token *cp)
+{
+	token *cur;
+	token *out;
+
+	out = cur = token_copy(cp);
+	if (!out)
+		return NULL;
+
+	while (cp->next) {
+		cp = cp->next;
+		cur->next = token_copy(cp);
+		if (!cur->next) {
+			token_delete_all(out);
+			return NULL;
+		}
+		cur->next->prev = cur;
+		cur = cur->next;
+	}
+
+	return out;
+}
+
+lex_file* lex_open(const char *file)
+{
+	lex_file *lex;
+	FILE *in = fopen(file, "rb");
+
+	if (!in) {
+		lexerror(NULL, "open failed: '%s'\n", file);
+		return NULL;
+	}
+
+	lex = (lex_file*)mem_a(sizeof(*lex));
+	if (!lex) {
+		fclose(in);
+		lexerror(NULL, "out of memory\n");
+		return NULL;
+	}
+
+	memset(lex, 0, sizeof(*lex));
+
+	lex->file = in;
+	lex->name = util_strdup(file);
+	lex->line = 1; /* we start counting at 1 */
+
+	lex->peekpos = 0;
+
+	return lex;
+}
+
+void lex_close(lex_file *lex)
+{
+	if (lex->file)
+		fclose(lex->file);
+	if (lex->tok)
+		token_delete(lex->tok);
+	mem_d(lex->name);
+	mem_d(lex);
+}
+
+/* Get or put-back data
+ * The following to functions do NOT understand what kind of data they
+ * are working on.
+ * The are merely wrapping get/put in order to count line numbers.
+ */
+static int lex_getch(lex_file *lex)
+{
+	int ch;
+
+	if (lex->peekpos) {
+		lex->peekpos--;
+		if (lex->peek[lex->peekpos] == '\n')
+			lex->line++;
+		return lex->peek[lex->peekpos];
+	}
+
+	ch = fgetc(lex->file);
+	if (ch == '\n')
+		lex->line++;
+	return ch;
+}
+
+static void lex_ungetch(lex_file *lex, int ch)
+{
+	lex->peek[lex->peekpos++] = ch;
+	if (ch == '\n')
+		lex->line--;
+}
+
+/* classify characters
+ * some additions to the is*() functions of ctype.h
+ */
+
+/* Idents are alphanumberic, but they start with alpha or _ */
+static bool isident_start(int ch)
+{
+	return isalpha(ch) || ch == '_';
+}
+
+static bool isident(int ch)
+{
+	return isident_start(ch) || isdigit(ch);
+}
+
+/* isxdigit_only is used when we already know it's not a digit
+ * and want to see if it's a hex digit anyway.
+ */
+static bool isxdigit_only(int ch)
+{
+	return (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
+}
+
+/* Skip whitespace and comments and return the first
+ * non-white character.
+ * As this makes use of the above getch() ungetch() functions,
+ * we don't need to care at all about line numbering anymore.
+ *
+ * In theory, this function should only be used at the beginning
+ * of lexing, or when we *know* the next character is part of the token.
+ * Otherwise, if the parser throws an error, the linenumber may not be
+ * the line of the error, but the line of the next token AFTER the error.
+ *
+ * This is currently only problematic when using c-like string-continuation,
+ * since comments and whitespaces are allowed between 2 such strings.
+ * Example:
+printf(   "line one\n"
+// A comment
+          "A continuation of the previous string"
+// This line is skipped
+      , foo);
+
+ * In this case, if the parse decides it didn't actually want a string,
+ * and uses lex->line to print an error, it will show the ', foo);' line's
+ * linenumber.
+ *
+ * On the other hand, the parser is supposed to remember the line of the next
+ * token's beginning. In this case we would want skipwhite() to be called
+ * AFTER reading a token, so that the parser, before reading the NEXT token,
+ * doesn't store teh *comment's* linenumber, but the actual token's linenumber.
+ *
+ * THIS SOLUTION
+ *    here is to store the line of the first character after skipping
+ *    the initial whitespace in lex->sline, this happens in lex_do.
+ */
+static int lex_skipwhite(lex_file *lex)
+{
+	int ch = 0;
+
+	do
+	{
+		ch = lex_getch(lex);
+		while (ch != EOF && isspace(ch)) ch = lex_getch(lex);
+
+		if (ch == '/') {
+			ch = lex_getch(lex);
+			if (ch == '/')
+			{
+				/* one line comment */
+				ch = lex_getch(lex);
+
+				/* check for special: '/', '/', '*', '/' */
+				if (ch == '*') {
+					ch = lex_getch(lex);
+					if (ch == '/') {
+						ch = ' ';
+						continue;
+					}
+				}
+
+				while (ch != EOF && ch != '\n') {
+					ch = lex_getch(lex);
+				}
+				continue;
+			}
+			if (ch == '*')
+			{
+				/* multiline comment */
+				while (ch != EOF)
+				{
+					ch = lex_getch(lex);
+					if (ch == '*') {
+						ch = lex_getch(lex);
+						if (ch == '/') {
+							ch = lex_getch(lex);
+							break;
+						}
+					}
+				}
+				if (ch == '/') /* allow *//* direct following comment */
+				{
+					lex_ungetch(lex, ch);
+					ch = ' '; /* cause TRUE in the isspace check */
+				}
+				continue;
+			}
+			/* Otherwise roll back to the slash and break out of the loop */
+			lex_ungetch(lex, ch);
+			ch = '/';
+			break;
+		}
+	} while (ch != EOF && isspace(ch));
+
+	return ch;
+}
+
+/* Append a character to the token buffer */
+static bool GMQCC_WARN lex_tokench(lex_file *lex, int ch)
+{
+	if (!token_value_add(lex->tok, ch)) {
+		lexerror(lex, "out of memory");
+		return false;
+	}
+	return true;
+}
+
+/* Append a trailing null-byte */
+static bool GMQCC_WARN lex_endtoken(lex_file *lex)
+{
+	if (!token_value_add(lex->tok, 0)) {
+		lexerror(lex, "out of memory");
+		return false;
+	}
+	lex->tok->value_count--;
+	return true;
+}
+
+/* Get a token */
+static bool GMQCC_WARN lex_finish_ident(lex_file *lex)
+{
+	int ch;
+
+	ch = lex_getch(lex);
+	while (ch != EOF && isident(ch))
+	{
+		if (!lex_tokench(lex, ch))
+			return (lex->tok->ttype = TOKEN_FATAL);
+		ch = lex_getch(lex);
+	}
+
+	/* last ch was not an ident ch: */
+	lex_ungetch(lex, ch);
+
+	return true;
+}
+
+static int GMQCC_WARN lex_finish_string(lex_file *lex, int quote)
+{
+	int ch = 0;
+
+	while (ch != EOF)
+	{
+		ch = lex_getch(lex);
+		if (ch == quote)
+			return TOKEN_STRINGCONST;
+
+		if (!lex_tokench(lex, ch))
+			return (lex->tok->ttype = TOKEN_FATAL);
+
+		/* as lexer we only care about \" to not terminate the string prematurely */
+		if (ch == '\\') {
+			ch = lex_getch(lex);
+			if (ch == EOF) {
+				lexerror(lex, "unexpected end of file");
+				lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
+				return (lex->tok->ttype = TOKEN_ERROR);
+			}
+			/* so we just add the next character no matter what it actually is */
+			if (!lex_tokench(lex, ch))
+				return (lex->tok->ttype = TOKEN_FATAL);
+		}
+	}
+	lexerror(lex, "unexpected end of file within string constant");
+	lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
+	return (lex->tok->ttype = TOKEN_ERROR);
+}
+
+static int GMQCC_WARN lex_finish_digit(lex_file *lex, int lastch)
+{
+	bool ishex = false;
+
+	int  ch = lastch;
+
+	/* parse a number... */
+	lex->tok->ttype = TOKEN_INTCONST;
+
+	if (!lex_tokench(lex, ch))
+		return (lex->tok->ttype = TOKEN_FATAL);
+
+	ch = lex_getch(lex);
+	if (ch != '.' && !isdigit(ch))
+	{
+		if (lastch != '0' || ch != 'x')
+		{
+			/* end of the number or EOF */
+			lex_ungetch(lex, ch);
+			if (!lex_endtoken(lex))
+				return (lex->tok->ttype = TOKEN_FATAL);
+
+			lex->tok->constval.i = lastch - '0';
+			return lex->tok->ttype;
+		}
+
+		ishex = true;
+	}
+
+	/* EOF would have been caught above */
+
+	if (ch != '.')
+	{
+		if (!lex_tokench(lex, ch))
+			return (lex->tok->ttype = TOKEN_FATAL);
+		ch = lex_getch(lex);
+		while (isdigit(ch) || (ishex && isxdigit_only(ch)))
+		{
+			if (!lex_tokench(lex, ch))
+				return (lex->tok->ttype = TOKEN_FATAL);
+			ch = lex_getch(lex);
+		}
+	}
+	/* NOT else, '.' can come from above as well */
+	if (ch == '.' && !ishex)
+	{
+		/* Allow floating comma in non-hex mode */
+		lex->tok->ttype = TOKEN_FLOATCONST;
+		if (!lex_tokench(lex, ch))
+			return (lex->tok->ttype = TOKEN_FATAL);
+
+		/* continue digits-only */
+		ch = lex_getch(lex);
+		while (isdigit(ch))
+		{
+			if (!lex_tokench(lex, ch))
+				return (lex->tok->ttype = TOKEN_FATAL);
+			ch = lex_getch(lex);
+		}
+	}
+	/* put back the last character */
+	/* but do not put back the trailing 'f' or a float */
+	if (lex->tok->ttype == TOKEN_FLOATCONST && ch == 'f')
+		ch = lex_getch(lex);
+
+	/* generally we don't want words to follow numbers: */
+	if (isident(ch)) {
+		lexerror(lex, "unexpected trailing characters after number");
+		return (lex->tok->ttype = TOKEN_ERROR);
+	}
+	lex_ungetch(lex, ch);
+
+	if (!lex_endtoken(lex))
+		return (lex->tok->ttype = TOKEN_FATAL);
+	if (lex->tok->ttype == TOKEN_FLOATCONST)
+		lex->tok->constval.f = strtod(lex->tok->value, NULL);
+	else
+		lex->tok->constval.i = strtol(lex->tok->value, NULL, 0);
+	return lex->tok->ttype;
+}
+
+int lex_do(lex_file *lex)
+{
+	int ch, nextch;
+
+	if (lex->tok)
+		token_delete(lex->tok);
+	lex->tok = token_new();
+	if (!lex->tok)
+		return TOKEN_FATAL;
+
+	ch = lex_skipwhite(lex);
+	lex->sline = lex->line;
+	lex->tok->ctx.line = lex->sline;
+	lex->tok->ctx.file = lex->name;
+
+	if (ch == EOF)
+		return (lex->tok->ttype = TOKEN_EOF);
+
+	/* single-character tokens */
+	switch (ch)
+	{
+		case ';':
+		case '(':
+		case ')':
+		case '{':
+		case '}':
+		case '[':
+		case ']':
+
+		case ',':
+
+			return (lex->tok->ttype = ch);
+		default:
+			break;
+	}
+
+	if (lex->flags.noops)
+	{
+		/* Detect characters early which are normally
+		 * operators OR PART of an operator.
+		 */
+		switch (ch)
+		{
+			case '+':
+			case '-':
+			case '*':
+			case '/':
+			case '<':
+			case '>':
+			case '=':
+			case '&':
+			case '|':
+			case '^':
+			case '~':
+				return ch;
+			default:
+				break;
+		}
+	}
+
+	if (ch == '+' || ch == '-' || /* ++, --, +=, -=  and -> as well! */
+	    ch == '>' || ch == '<' || /* <<, >>, <=, >= */
+	    ch == '=' ||              /* == */
+	    ch == '&' || ch == '|')   /* &&, ||, &=, |= */
+	{
+		if (!lex_tokench(lex, ch))
+			return (lex->tok->ttype = TOKEN_FATAL);
+
+		nextch = lex_getch(lex);
+		if (nextch == ch || nextch == '=') {
+			if (!lex_tokench(lex, nextch))
+				return (lex->tok->ttype = TOKEN_FATAL);
+		} else if (ch == '-' && nextch == '>') {
+			if (!lex_tokench(lex, nextch))
+				return (lex->tok->ttype = TOKEN_FATAL);
+		} else
+			lex_ungetch(lex, nextch);
+
+		if (!lex_endtoken(lex))
+			return (lex->tok->ttype = TOKEN_FATAL);
+		return (lex->tok->ttype = TOKEN_OPERATOR);
+	}
+
+	if (ch == '^' || ch == '~' || ch == '!')
+	{
+		if (!lex_tokench(lex, ch) ||
+			!lex_endtoken(lex))
+		{
+			return (lex->tok->ttype = TOKEN_FATAL);
+		}
+		return (lex->tok->ttype = TOKEN_OPERATOR);
+	}
+
+	if (ch == '*' || ch == '/') /* *=, /= */
+	{
+		if (!lex_tokench(lex, ch))
+			return (lex->tok->ttype = TOKEN_FATAL);
+
+		nextch = lex_getch(lex);
+		if (nextch == '=') {
+			if (!lex_tokench(lex, nextch))
+				return (lex->tok->ttype = TOKEN_FATAL);
+		} else
+			lex_ungetch(lex, nextch);
+
+		if (!lex_endtoken(lex))
+			return (lex->tok->ttype = TOKEN_FATAL);
+		return (lex->tok->ttype = TOKEN_OPERATOR);
+	}
+
+	if (isident_start(ch))
+	{
+		const char *v;
+		if (!lex_tokench(lex, ch))
+			return (lex->tok->ttype = TOKEN_FATAL);
+		if (!lex_finish_ident(lex)) {
+			/* error? */
+			return (lex->tok->ttype = TOKEN_ERROR);
+		}
+		if (!lex_endtoken(lex))
+			return (lex->tok->ttype = TOKEN_FATAL);
+		lex->tok->ttype = TOKEN_IDENT;
+
+		v = lex->tok->value;
+		if (!strcmp(v, "void") ||
+		    !strcmp(v, "int") ||
+		    !strcmp(v, "float") ||
+		    !strcmp(v, "vector") )
+		{
+			lex->tok->ttype = TOKEN_TYPENAME;
+			switch (v[1]) {
+				case 'o': lex->tok->constval.t = TYPE_VOID;    break;
+				case 'n': lex->tok->constval.t = TYPE_INTEGER; break;
+				case 'l': lex->tok->constval.t = TYPE_FLOAT;   break;
+				case 'e': lex->tok->constval.t = TYPE_VECTOR;  break;
+			}
+		}
+		else if (!strcmp(v, "for") ||
+		         !strcmp(v, "while") ||
+		         !strcmp(v, "do"))
+			lex->tok->ttype = TOKEN_KEYWORD;
+
+		return lex->tok->ttype;
+	}
+
+	if (ch == '"')
+	{
+		lex->tok->ttype = lex_finish_string(lex, '"');
+		while (lex->tok->ttype == TOKEN_STRINGCONST)
+		{
+			/* Allow c style "string" "continuation" */
+			ch = lex_skipwhite(lex);
+			if (ch != '"') {
+				lex_ungetch(lex, ch);
+				break;
+			}
+
+			lex->tok->ttype = lex_finish_string(lex, '"');
+		}
+		if (!lex_endtoken(lex))
+			return (lex->tok->ttype = TOKEN_FATAL);
+		return lex->tok->ttype;
+	}
+
+	if (ch == '\'')
+	{
+		/* we parse character constants like string,
+		 * but return TOKEN_CHARCONST, or a vector type if it fits...
+		 * Likewise actual unescaping has to be done by the parser.
+		 * The difference is we don't allow 'char' 'continuation'.
+		 */
+		 lex->tok->ttype = lex_finish_string(lex, '\'');
+		 if (!lex_endtoken(lex))
+		 	 return (lex->tok->ttype = TOKEN_FATAL);
+
+		 /* It's a vector if we can successfully scan 3 floats */
+		 if (sscanf(lex->tok->value, " %f %f %f ", &lex->tok->constval.v.x, &lex->tok->constval.v.y, &lex->tok->constval.v.z) == 3)
+		 {
+		 	 lex->tok->ttype = TOKEN_VECTORCONST;
+		 }
+
+		 return lex->tok->ttype;
+	}
+
+	if (isdigit(ch))
+	{
+		lex->tok->ttype = lex_finish_digit(lex, ch);
+		if (!lex_endtoken(lex))
+			return (lex->tok->ttype = TOKEN_FATAL);
+		return lex->tok->ttype;
+	}
+
+	lexerror(lex, "unknown token");
+	return (lex->tok->ttype = TOKEN_ERROR);
+}
--- a/lexer.h
+++ b/lexer.h
@ -0,0 +1,207 @@
+#ifndef GMQCC_LEXER_HDR_
+#define GMQCC_LEXER_HDR_
+
+typedef struct token_s token;
+
+#include "ast.h"
+
+struct token_s {
+	int ttype;
+
+	MEM_VECTOR_MAKE(char, value);
+
+	union {
+		vector v;
+		int    i;
+		double f;
+		int    t; /* type */
+	} constval;
+
+	struct token_s *next;
+	struct token_s *prev;
+
+	lex_ctx ctx;
+};
+
+token* token_new();
+void   token_delete(token*);
+token* token_copy(const token *cp);
+void   token_delete_all(token *t);
+token* token_copy_all(const token *cp);
+
+/* Lexer
+ *
+ */
+enum {
+    /* Other tokens which we can return: */
+    TOKEN_NONE = 0,
+    TOKEN_START = 128,
+
+    TOKEN_IDENT,
+
+    TOKEN_TYPENAME,
+
+    TOKEN_OPERATOR,
+
+    TOKEN_KEYWORD, /* loop */
+
+    TOKEN_STRINGCONST, /* not the typename but an actual "string" */
+    TOKEN_CHARCONST,
+    TOKEN_VECTORCONST,
+    TOKEN_INTCONST,
+    TOKEN_FLOATCONST,
+
+    TOKEN_EOF,
+
+    /* We use '< TOKEN_ERROR', so TOKEN_FATAL must come after it and any
+     * other error related tokens as well
+     */
+    TOKEN_ERROR,
+    TOKEN_FATAL /* internal error, eg out of memory */
+};
+
+static const char *_tokennames[] = {
+    "TOKEN_START",
+    "TOKEN_IDENT",
+    "TOKEN_TYPENAME",
+    "TOKEN_OPERATOR",
+    "TOKEN_KEYWORD",
+    "TOKEN_STRINGCONST",
+    "TOKEN_CHARCONST",
+    "TOKEN_VECTORCONST",
+    "TOKEN_INTCONST",
+    "TOKEN_FLOATCONST",
+    "TOKEN_EOF",
+    "TOKEN_ERROR",
+    "TOKEN_FATAL",
+};
+typedef int
+_all_tokennames_added_[
+	((TOKEN_FATAL - TOKEN_START + 1) ==
+	 (sizeof(_tokennames)/sizeof(_tokennames[0])))
+	? 1 : -1];
+
+typedef struct {
+	FILE   *file;
+	char   *name;
+	size_t  line;
+	size_t  sline; /* line at the start of a token */
+
+	char    peek[256];
+	size_t  peekpos;
+
+	token  *tok;
+
+	struct {
+	    bool noops;
+	} flags;
+} lex_file;
+
+MEM_VECTOR_PROTO(lex_file, char, token);
+
+lex_file* lex_open (const char *file);
+void      lex_close(lex_file   *lex);
+int       lex_do   (lex_file   *lex);
+
+/* Parser
+ *
+ */
+
+enum {
+    ASSOC_LEFT,
+    ASSOC_RIGHT
+};
+
+#define OP_SUFFIX 1
+#define OP_PREFIX 2
+
+typedef struct {
+    const char   *op;
+    unsigned int assoc;
+    unsigned int prec;
+    unsigned int flags;
+} oper_info;
+
+static const oper_info operators[] = {
+    { "++",  ASSOC_LEFT,  16, OP_SUFFIX},
+    { "--",  ASSOC_LEFT,  16, OP_SUFFIX},
+
+    { ".",   ASSOC_LEFT,  15, 0 },
+
+    { "!",   ASSOC_RIGHT, 14, 0 },
+    { "~",   ASSOC_RIGHT, 14, 0 },
+    { "+",   ASSOC_RIGHT, 14, OP_PREFIX },
+    { "-",   ASSOC_RIGHT, 14, OP_PREFIX },
+    { "++",  ASSOC_RIGHT, 14, OP_PREFIX },
+    { "--",  ASSOC_RIGHT, 14, OP_PREFIX },
+/*  { "&",   ASSOC_RIGHT, 14, OP_PREFIX }, */
+
+    { "*",   ASSOC_LEFT,  13, 0 },
+    { "/",   ASSOC_LEFT,  13, 0 },
+    { "%",   ASSOC_LEFT,  13, 0 },
+
+    { "+",   ASSOC_LEFT,  12, 0 },
+    { "-",   ASSOC_LEFT,  12, 0 },
+
+    { "<<",  ASSOC_LEFT,  11, 0 },
+    { ">>",  ASSOC_LEFT,  11, 0 },
+
+    { "<",   ASSOC_LEFT,  10, 0 },
+    { ">",   ASSOC_LEFT,  10, 0 },
+    { "<=",  ASSOC_LEFT,  10, 0 },
+    { ">=",  ASSOC_LEFT,  10, 0 },
+
+    { "==",  ASSOC_LEFT,  9,  0 },
+    { "!=",  ASSOC_LEFT,  9,  0 },
+
+    { "&",   ASSOC_LEFT,  8,  0 },
+
+    { "^",   ASSOC_LEFT,  7,  0 },
+
+    { "|",   ASSOC_LEFT,  6,  0 },
+
+    { "&&",  ASSOC_LEFT,  5,  0 },
+
+    { "||",  ASSOC_LEFT,  4,  0 },
+
+    { "?",   ASSOC_RIGHT, 3,  0 },
+
+    { "=",   ASSOC_RIGHT, 2,  0 },
+    { "+=",  ASSOC_RIGHT, 2,  0 },
+    { "-=",  ASSOC_RIGHT, 2,  0 },
+    { "*=",  ASSOC_RIGHT, 2,  0 },
+    { "/=",  ASSOC_RIGHT, 2,  0 },
+    { "%=",  ASSOC_RIGHT, 2,  0 },
+    { ">>=", ASSOC_RIGHT, 2,  0 },
+    { "<<=", ASSOC_RIGHT, 2,  0 },
+    { "&=",  ASSOC_RIGHT, 2,  0 },
+    { "^=",  ASSOC_RIGHT, 2,  0 },
+    { "|=",  ASSOC_RIGHT, 2,  0 },
+};
+
+typedef struct
+{
+	lex_file *lex;
+	int      error;
+	lex_ctx  ctx;
+
+	token    *tokens;
+	token    *lastok;
+
+	token    *tok; /* current token */
+
+	MEM_VECTOR_MAKE(ast_value*, globals);
+} parse_file;
+
+MEM_VECTOR_PROTO(parse_file, ast_value*, globals);
+
+parse_file* parse_open(const char *file);
+void        parse_file_close(parse_file*);
+
+bool        parse(parse_file*);
+
+bool        parse_iskey(parse_file *self, const char *ident);
+
+void lexerror(lex_file*, const char *fmt, ...);
+
+#endif
--- a/main.c
+++ b/main.c
@ -21,19 +21,18 @@
 * SOFTWARE.
 */
 #include "gmqcc.h"
-typedef struct { char *name, type; } argitem;
-VECTOR_MAKE(argitem, items);

+bool parser_compile(const char *filename);
 int main(int argc, char **argv) {
-    size_t itr;
-
    util_debug("COM", "starting ...\n");

+    if (argc == 2) {
+        if (!parser_compile(argv[1])) {
+            printf("There were compile errors\n");
+        }
+    }
+
    util_debug("COM", "cleaning ...\n");
-    /* clean list */
-    for (itr = 0; itr < items_elements; itr++)
-        mem_d(items_data[itr].name);
-    mem_d(items_data);

    util_meminfo();
    return 0;
--- a/parser.c
+++ b/parser.c
@ -0,0 +1,7 @@
+#include "gmqcc.h"
+#include "lexer.h"
+
+bool parser_compile(const char *filename)
+{
+    return false;
+}