From 8bbc512b14b3a02af1ad42556763bcf5e9e6aece Mon Sep 17 00:00:00 2001
From: Lactozilla
Date: Fri, 19 Jan 2024 16:40:32 -0300
Subject: [PATCH] Tokenizer changes: Capture '"' characters Capture ';' tokens
 Implement tracking of the current line

---
Review notes (revised patch; this section is ignored by git am):
* Tokenizer_Open now initializes the new 'line' field next to the other
  fields it already resets. Nothing else in this patch assigned it before
  DetectLineBreak started incrementing it.
* Tokenizer_Read, inString == 2: when the string was never closed,
  startPos is already at inputLength. The original branch still advanced
  endPos to inputLength + 1, so the next call skipped the
  "startPos == inputLength" end check and read past the input. It now
  returns NULL instead.
* Tokenizer_Read, inString == 1: the bounds test is evaluated before the
  input is indexed.
* NOTE(review): Tokenizer_SRB2Read also does endPos++ after an unterminated
  string; it presumably has the same past-the-end problem. Left untouched
  here because its end-of-input check is outside this diff. Please confirm.
* NOTE(review): ';' is added to the token terminators but is not visible in
  the single-character token list; confirm that a leading ';' is handled as
  intended.
* Hunk headers and the diffstat were recomputed for the added lines. The
  blob ids on the "index" lines still refer to the original revision.
  Indentation was restored from a whitespace-collapsed copy, so applying
  may require --ignore-whitespace.

 src/m_tokenizer.c | 91 ++++++++++++++++++++++++++++++++++++----------
 src/m_tokenizer.h |  2 ++
 2 files changed, 72 insertions(+), 21 deletions(-)

diff --git a/src/m_tokenizer.c b/src/m_tokenizer.c
index 26275881d..876dc6b20 100644
--- a/src/m_tokenizer.c
+++ b/src/m_tokenizer.c
@@ -21,6 +21,8 @@ tokenizer_t *Tokenizer_Open(const char *inputString, unsigned numTokens)
 	tokenizer->endPos = 0;
 	tokenizer->inputLength = 0;
 	tokenizer->inComment = 0;
+	tokenizer->inString = 0;
+	tokenizer->line = 0;
 	tokenizer->get = Tokenizer_Read;
 
 	if (numTokens < 1)
@@ -53,7 +55,20 @@ void Tokenizer_Close(tokenizer_t *tokenizer)
 	Z_Free(tokenizer);
 }
 
-static void Tokenizer_DetectComment(tokenizer_t *tokenizer, UINT32 *pos)
+// Bumps the tokenizer's line counter if the input character at pos is '\n'.
+// Returns true if a line break was found at pos, false otherwise.
+static boolean DetectLineBreak(tokenizer_t *tokenizer, size_t pos)
+{
+	if (tokenizer->input[pos] == '\n')
+	{
+		tokenizer->line++;
+		return true;
+	}
+
+	return false;
+}
+
+static void DetectComment(tokenizer_t *tokenizer, UINT32 *pos)
 {
 	if (tokenizer->inComment)
 		return;
@@ -94,8 +109,40 @@ const char *Tokenizer_Read(tokenizer_t *tokenizer, UINT32 i)
 
 	tokenizer->startPos = tokenizer->endPos;
 
+	// If in a string, return the entire string within quotes, except without the quotes.
+	if (tokenizer->inString == 1)
+	{
+		while (tokenizer->endPos < tokenizer->inputLength && tokenizer->input[tokenizer->endPos] != '"')
+		{
+			DetectLineBreak(tokenizer, tokenizer->endPos);
+			tokenizer->endPos++;
+		}
+
+		Tokenizer_ReadTokenString(tokenizer, i);
+		tokenizer->inString = 2;
+		return tokenizer->token[i];
+	}
+	// If just ended a string, return only a quotation mark.
+	else if (tokenizer->inString == 2)
+	{
+		tokenizer->inString = 0;
+
+		// The string was never closed: there is no quotation mark to return,
+		// and stepping past this position would read beyond the input.
+		if (tokenizer->startPos >= tokenizer->inputLength)
+		{
+			tokenizer->endPos = tokenizer->inputLength;
+			return NULL;
+		}
+
+		tokenizer->endPos = tokenizer->startPos + 1;
+		tokenizer->token[i][0] = tokenizer->input[tokenizer->startPos];
+		tokenizer->token[i][1] = '\0';
+		return tokenizer->token[i];
+	}
+
 	// Try to detect comments now, in case we're pointing right at one
-	Tokenizer_DetectComment(tokenizer, &tokenizer->startPos);
+	DetectComment(tokenizer, &tokenizer->startPos);
 
 	// Find the first non-whitespace char, or else the end of the string trying
 	while ((tokenizer->input[tokenizer->startPos] == ' '
@@ -106,8 +153,10 @@ const char *Tokenizer_Read(tokenizer_t *tokenizer, UINT32 i)
 		|| tokenizer->inComment != 0)
 		&& tokenizer->startPos < tokenizer->inputLength)
 	{
+		boolean inLineBreak = DetectLineBreak(tokenizer, tokenizer->startPos);
+
 		// Try to detect comment endings now
-		if (tokenizer->inComment == 1 && tokenizer->input[tokenizer->startPos] == '\n')
+		if (tokenizer->inComment == 1 && inLineBreak)
 			tokenizer->inComment = 0; // End of line for a single-line comment
 		else if (tokenizer->inComment == 2
 			&& tokenizer->startPos < tokenizer->inputLength - 1
@@ -120,11 +169,12 @@ const char *Tokenizer_Read(tokenizer_t *tokenizer, UINT32 i)
 		}
 
 		tokenizer->startPos++;
-		Tokenizer_DetectComment(tokenizer, &tokenizer->startPos);
+		DetectComment(tokenizer, &tokenizer->startPos);
 	}
 
 	// If the end of the string is reached, no token is to be read
-	if (tokenizer->startPos == tokenizer->inputLength) {
+	if (tokenizer->startPos == tokenizer->inputLength)
+	{
 		tokenizer->endPos = tokenizer->inputLength;
 		return NULL;
 	}
@@ -136,22 +186,15 @@ const char *Tokenizer_Read(tokenizer_t *tokenizer, UINT32 i)
 			|| tokenizer->input[tokenizer->startPos] == ']'
 			|| tokenizer->input[tokenizer->startPos] == '='
 			|| tokenizer->input[tokenizer->startPos] == ':'
-			|| tokenizer->input[tokenizer->startPos] == '%')
+			|| tokenizer->input[tokenizer->startPos] == '%'
+			|| tokenizer->input[tokenizer->startPos] == '"')
 	{
 		tokenizer->endPos = tokenizer->startPos + 1;
 		tokenizer->token[i][0] = tokenizer->input[tokenizer->startPos];
 		tokenizer->token[i][1] = '\0';
-		return tokenizer->token[i];
-	}
-	// Return entire string within quotes, except without the quotes.
-	else if (tokenizer->input[tokenizer->startPos] == '"')
-	{
-		tokenizer->endPos = ++tokenizer->startPos;
-		while (tokenizer->input[tokenizer->endPos] != '"' && tokenizer->endPos < tokenizer->inputLength)
-			tokenizer->endPos++;
+		if (tokenizer->input[tokenizer->startPos] == '"')
+			tokenizer->inString = 1;
 
-		Tokenizer_ReadTokenString(tokenizer, i);
-		tokenizer->endPos++;
 		return tokenizer->token[i];
 	}
 
@@ -169,12 +212,13 @@ const char *Tokenizer_Read(tokenizer_t *tokenizer, UINT32 i)
 			&& tokenizer->input[tokenizer->endPos] != '='
 			&& tokenizer->input[tokenizer->endPos] != ':'
 			&& tokenizer->input[tokenizer->endPos] != '%'
+			&& tokenizer->input[tokenizer->endPos] != ';'
 			&& tokenizer->inComment == 0)
 			&& tokenizer->endPos < tokenizer->inputLength)
 	{
 		tokenizer->endPos++;
 		// Try to detect comment starts now; if it's in a comment, we don't want it in this token
-		Tokenizer_DetectComment(tokenizer, &tokenizer->endPos);
+		DetectComment(tokenizer, &tokenizer->endPos);
 	}
 
 	Tokenizer_ReadTokenString(tokenizer, i);
@@ -189,7 +233,7 @@ const char *Tokenizer_SRB2Read(tokenizer_t *tokenizer, UINT32 i)
 	tokenizer->startPos = tokenizer->endPos;
 
 	// Try to detect comments now, in case we're pointing right at one
-	Tokenizer_DetectComment(tokenizer, &tokenizer->startPos);
+	DetectComment(tokenizer, &tokenizer->startPos);
 
 	// Find the first non-whitespace char, or else the end of the string trying
 	while ((tokenizer->input[tokenizer->startPos] == ' '
@@ -201,8 +245,10 @@ const char *Tokenizer_SRB2Read(tokenizer_t *tokenizer, UINT32 i)
 		|| tokenizer->inComment != 0)
 		&& tokenizer->startPos < tokenizer->inputLength)
 	{
+		boolean inLineBreak = DetectLineBreak(tokenizer, tokenizer->startPos);
+
 		// Try to detect comment endings now
-		if (tokenizer->inComment == 1 && tokenizer->input[tokenizer->startPos] == '\n')
+		if (tokenizer->inComment == 1 && inLineBreak)
 			tokenizer->inComment = 0; // End of line for a single-line comment
 		else if (tokenizer->inComment == 2
 			&& tokenizer->startPos < tokenizer->inputLength - 1
@@ -215,7 +261,7 @@ const char *Tokenizer_SRB2Read(tokenizer_t *tokenizer, UINT32 i)
 		}
 
 		tokenizer->startPos++;
-		Tokenizer_DetectComment(tokenizer, &tokenizer->startPos);
+		DetectComment(tokenizer, &tokenizer->startPos);
 	}
 
 	// If the end of the string is reached, no token is to be read
@@ -238,7 +284,10 @@ const char *Tokenizer_SRB2Read(tokenizer_t *tokenizer, UINT32 i)
 	{
 		tokenizer->endPos = ++tokenizer->startPos;
 		while (tokenizer->input[tokenizer->endPos] != '"' && tokenizer->endPos < tokenizer->inputLength)
+		{
+			DetectLineBreak(tokenizer, tokenizer->endPos);
 			tokenizer->endPos++;
+		}
 
 		Tokenizer_ReadTokenString(tokenizer, i);
 		tokenizer->endPos++;
@@ -260,7 +309,7 @@ const char *Tokenizer_SRB2Read(tokenizer_t *tokenizer, UINT32 i)
 	{
 		tokenizer->endPos++;
 		// Try to detect comment starts now; if it's in a comment, we don't want it in this token
-		Tokenizer_DetectComment(tokenizer, &tokenizer->endPos);
+		DetectComment(tokenizer, &tokenizer->endPos);
 	}
 
 	Tokenizer_ReadTokenString(tokenizer, i);
diff --git a/src/m_tokenizer.h b/src/m_tokenizer.h
index 88cb2a566..f51117301 100644
--- a/src/m_tokenizer.h
+++ b/src/m_tokenizer.h
@@ -24,6 +24,8 @@ typedef struct Tokenizer
 	UINT32 endPos;
 	UINT32 inputLength;
 	UINT8 inComment; // 0 = not in comment, 1 = // Single-line, 2 = /* Multi-line */
+	UINT8 inString; // 0 = not in string, 1 = in string, 2 = just left string
+	int line; // Number of '\n' characters counted so far (0 while on the first line)
 	const char *(*get)(struct Tokenizer*, UINT32);
 } tokenizer_t;
 