2023-10-30 17:54:32 +00:00
// SONIC ROBO BLAST 2
//-----------------------------------------------------------------------------
// Copyright (C) 2013-2023 by Sonic Team Junior.
//
// This program is free software distributed under the
// terms of the GNU General Public License, version 2.
// See the 'LICENSE' file for more details.
//-----------------------------------------------------------------------------
/// \file m_tokenizer.c
/// \brief Tokenizer
# include "m_tokenizer.h"
# include "z_zone.h"
tokenizer_t * Tokenizer_Open ( const char * inputString , unsigned numTokens )
{
tokenizer_t * tokenizer = Z_Malloc ( sizeof ( tokenizer_t ) , PU_STATIC , NULL ) ;
tokenizer - > input = inputString ;
tokenizer - > startPos = 0 ;
tokenizer - > endPos = 0 ;
tokenizer - > inputLength = 0 ;
tokenizer - > inComment = 0 ;
2024-01-19 19:40:32 +00:00
tokenizer - > inString = 0 ;
2023-10-30 17:54:32 +00:00
tokenizer - > get = Tokenizer_Read ;
if ( numTokens < 1 )
numTokens = 1 ;
tokenizer - > numTokens = numTokens ;
tokenizer - > capacity = Z_Malloc ( sizeof ( UINT32 ) * numTokens , PU_STATIC , NULL ) ;
tokenizer - > token = Z_Malloc ( sizeof ( char * ) * numTokens , PU_STATIC , NULL ) ;
for ( size_t i = 0 ; i < numTokens ; i + + )
{
tokenizer - > capacity [ i ] = 1024 ;
tokenizer - > token [ i ] = ( char * ) Z_Malloc ( tokenizer - > capacity [ i ] * sizeof ( char ) , PU_STATIC , NULL ) ;
}
tokenizer - > inputLength = strlen ( tokenizer - > input ) ;
return tokenizer ;
}
/// Releases a tokenizer and every buffer it owns.
///
/// \param tokenizer tokenizer to free; NULL is accepted and ignored
void Tokenizer_Close(tokenizer_t *tokenizer)
{
	size_t slot = 0;

	if (tokenizer == NULL)
		return;

	// Per-slot token buffers go first, then the two arrays, then the object.
	while (slot < tokenizer->numTokens)
	{
		Z_Free(tokenizer->token[slot]);
		slot++;
	}

	Z_Free(tokenizer->capacity);
	Z_Free(tokenizer->token);
	Z_Free(tokenizer);
}
2024-01-19 19:40:32 +00:00
/// Checks whether the character at pos is a newline and, if so, bumps the
/// tokenizer's line counter.
///
/// \param tokenizer tokenizer whose input is inspected
/// \param pos       index into the input string
/// \return true if input[pos] is '\n', false otherwise
static boolean DetectLineBreak(tokenizer_t *tokenizer, size_t pos)
{
	if (tokenizer->input[pos] != '\n')
		return false;

	tokenizer->line++;
	return true;
}
/// Looks for a comment opener at *pos and records it in tokenizer->inComment.
///
/// Sets inComment to 1 for "//" and to 2 for "/*". Does nothing if a comment
/// is already open, or if fewer than two characters remain at *pos.
/// The position itself is only read, never advanced.
///
/// \param tokenizer tokenizer whose input is inspected
/// \param pos       pointer to the index to test
static void DetectComment(tokenizer_t *tokenizer, UINT32 *pos)
{
	if (tokenizer->inComment)
		return;

	// An opener needs two characters. Test the length first so that
	// "inputLength - 1" cannot wrap around when the input is empty.
	if (tokenizer->inputLength < 2 || *pos >= tokenizer->inputLength - 1)
		return;

	if (tokenizer->input[*pos] != '/')
		return;

	// Single-line comment start
	if (tokenizer->input[*pos + 1] == '/')
		tokenizer->inComment = 1;
	// Multi-line comment start
	else if (tokenizer->input[*pos + 1] == '*')
		tokenizer->inComment = 2;
}
/// Copies input[startPos, endPos) into token slot i as a NUL-terminated string.
///
/// The slot's buffer is replaced by a larger one when the token (plus its
/// terminator) does not fit in the current capacity.
///
/// \param tokenizer tokenizer holding the input and the token buffers
/// \param i         index of the token slot to fill
static void Tokenizer_ReadTokenString(tokenizer_t *tokenizer, UINT32 i)
{
	UINT32 tokenLength = tokenizer->endPos - tokenizer->startPos;

	if (tokenLength + 1 > tokenizer->capacity[i])
	{
		// Release the old buffer before replacing it; its contents are about
		// to be overwritten anyway, and dropping the pointer would leak it.
		Z_Free(tokenizer->token[i]);

		tokenizer->capacity[i] = tokenLength + 1;
		// Assign the memory. Don't forget an extra byte for the end of the string!
		tokenizer->token[i] = (char *)Z_Malloc(tokenizer->capacity[i] * sizeof(char), PU_STATIC, NULL);
	}

	// Copy the string.
	M_Memcpy(tokenizer->token[i], tokenizer->input + tokenizer->startPos, (size_t)tokenLength);

	// Make the final character NUL.
	tokenizer->token[i][tokenLength] = '\0';
}
/// Reads the next token from the input into token slot i.
///
/// Whitespace and both comment styles ("//" and "/* */") are skipped.
/// The characters , { } [ ] = : % @ " are each returned as a one-character
/// token. A quoted string is delivered over three consecutive calls: the
/// opening quote, the text between the quotes, then the closing quote.
/// Any other token runs until whitespace, one of , { } [ ] = : % @ ; or the
/// start of a comment.
///
/// \param tokenizer tokenizer to read from
/// \param i         index of the token slot that receives the text
/// \return the slot's buffer, or NULL if there is no input or no token left;
///         the buffer is reused (and possibly reallocated) by later reads
///         into the same slot
const char *Tokenizer_Read(tokenizer_t *tokenizer, UINT32 i)
{
	if (!tokenizer->input)
		return NULL;

	// endPos can be set from outside (Tokenizer_SetEndPos); never begin
	// scanning beyond the string terminator.
	if (tokenizer->endPos > tokenizer->inputLength)
		tokenizer->endPos = tokenizer->inputLength;

	tokenizer->startPos = tokenizer->endPos;

	// If in a string, return the entire string within quotes, except without the quotes.
	if (tokenizer->inString == 1)
	{
		while (tokenizer->endPos < tokenizer->inputLength
			&& tokenizer->input[tokenizer->endPos] != '"')
		{
			DetectLineBreak(tokenizer, tokenizer->endPos);
			tokenizer->endPos++;
		}

		Tokenizer_ReadTokenString(tokenizer, i);
		tokenizer->inString = 2;
		return tokenizer->token[i];
	}
	// If just ended a string, return only a quotation mark.
	else if (tokenizer->inString == 2)
	{
		// Step over the closing quote only if there is one. For an
		// unterminated string startPos is already at the terminator;
		// advancing would push endPos past the end of the input and make
		// the next read index out of bounds. The token is then empty.
		if (tokenizer->startPos < tokenizer->inputLength)
			tokenizer->endPos = tokenizer->startPos + 1;
		else
			tokenizer->endPos = tokenizer->inputLength;

		tokenizer->token[i][0] = tokenizer->input[tokenizer->startPos];
		tokenizer->token[i][1] = '\0';
		tokenizer->inString = 0;
		return tokenizer->token[i];
	}

	// Try to detect comments now, in case we're pointing right at one
	DetectComment(tokenizer, &tokenizer->startPos);

	// Find the first non-whitespace char, or else the end of the string trying
	while (tokenizer->startPos < tokenizer->inputLength
		&& (tokenizer->input[tokenizer->startPos] == ' '
			|| tokenizer->input[tokenizer->startPos] == '\t'
			|| tokenizer->input[tokenizer->startPos] == '\r'
			|| tokenizer->input[tokenizer->startPos] == '\n'
			|| tokenizer->input[tokenizer->startPos] == '\0'
			|| tokenizer->inComment != 0))
	{
		boolean inLineBreak = DetectLineBreak(tokenizer, tokenizer->startPos);

		// Try to detect comment endings now
		if (tokenizer->inComment == 1 && inLineBreak)
			tokenizer->inComment = 0; // End of line for a single-line comment
		else if (tokenizer->inComment == 2
			&& tokenizer->startPos < tokenizer->inputLength - 1
			&& tokenizer->input[tokenizer->startPos] == '*'
			&& tokenizer->input[tokenizer->startPos + 1] == '/')
		{
			// End of multi-line comment
			tokenizer->inComment = 0;
			tokenizer->startPos++; // Skip the '*'; the increment below skips the '/'
		}

		tokenizer->startPos++;
		DetectComment(tokenizer, &tokenizer->startPos);
	}

	// If the end of the string is reached, no token is to be read
	if (tokenizer->startPos >= tokenizer->inputLength)
	{
		tokenizer->endPos = tokenizer->inputLength;
		return NULL;
	}
	// Else, if it's one of the single-character symbols, capture only this one character
	else if (tokenizer->input[tokenizer->startPos] == ','
		|| tokenizer->input[tokenizer->startPos] == '{'
		|| tokenizer->input[tokenizer->startPos] == '}'
		|| tokenizer->input[tokenizer->startPos] == '['
		|| tokenizer->input[tokenizer->startPos] == ']'
		|| tokenizer->input[tokenizer->startPos] == '='
		|| tokenizer->input[tokenizer->startPos] == ':'
		|| tokenizer->input[tokenizer->startPos] == '%'
		|| tokenizer->input[tokenizer->startPos] == '@'
		|| tokenizer->input[tokenizer->startPos] == '"')
	{
		tokenizer->endPos = tokenizer->startPos + 1;
		tokenizer->token[i][0] = tokenizer->input[tokenizer->startPos];
		tokenizer->token[i][1] = '\0';

		// An opening quote switches the next read into string mode.
		if (tokenizer->input[tokenizer->startPos] == '"')
			tokenizer->inString = 1;

		return tokenizer->token[i];
	}

	// Now find the end of the token. This includes several additional characters that are okay to capture as one character, but not trailing at the end of another token.
	tokenizer->endPos = tokenizer->startPos + 1;
	while (tokenizer->endPos < tokenizer->inputLength
		&& tokenizer->input[tokenizer->endPos] != ' '
		&& tokenizer->input[tokenizer->endPos] != '\t'
		&& tokenizer->input[tokenizer->endPos] != '\r'
		&& tokenizer->input[tokenizer->endPos] != '\n'
		&& tokenizer->input[tokenizer->endPos] != ','
		&& tokenizer->input[tokenizer->endPos] != '{'
		&& tokenizer->input[tokenizer->endPos] != '}'
		&& tokenizer->input[tokenizer->endPos] != '['
		&& tokenizer->input[tokenizer->endPos] != ']'
		&& tokenizer->input[tokenizer->endPos] != '='
		&& tokenizer->input[tokenizer->endPos] != ':'
		&& tokenizer->input[tokenizer->endPos] != '%'
		&& tokenizer->input[tokenizer->endPos] != '@'
		&& tokenizer->input[tokenizer->endPos] != ';'
		&& tokenizer->inComment == 0)
	{
		tokenizer->endPos++;
		// Try to detect comment starts now; if it's in a comment, we don't want it in this token
		DetectComment(tokenizer, &tokenizer->endPos);
	}

	Tokenizer_ReadTokenString(tokenizer, i);
	return tokenizer->token[i];
}
/// Reads the next token from the input into token slot i, using the
/// SRB2 / UDMF TEXTMAP rules.
///
/// Whitespace, '=' and ';' are skipped as separators, as are both comment
/// styles ("//" and "/* */"). The characters , { } are each returned as a
/// one-character token. A quoted string is returned in a single call with
/// the quotes stripped. Any other token runs until whitespace, one of
/// , { } = ; or the start of a comment.
///
/// \param tokenizer tokenizer to read from
/// \param i         index of the token slot that receives the text
/// \return the slot's buffer, or NULL if there is no input or no token left;
///         the buffer is reused (and possibly reallocated) by later reads
///         into the same slot
const char *Tokenizer_SRB2Read(tokenizer_t *tokenizer, UINT32 i)
{
	if (!tokenizer->input)
		return NULL;

	// endPos can be set from outside (Tokenizer_SetEndPos); never begin
	// scanning beyond the string terminator.
	if (tokenizer->endPos > tokenizer->inputLength)
		tokenizer->endPos = tokenizer->inputLength;

	tokenizer->startPos = tokenizer->endPos;

	// Try to detect comments now, in case we're pointing right at one
	DetectComment(tokenizer, &tokenizer->startPos);

	// Find the first non-whitespace char, or else the end of the string trying
	while (tokenizer->startPos < tokenizer->inputLength
		&& (tokenizer->input[tokenizer->startPos] == ' '
			|| tokenizer->input[tokenizer->startPos] == '\t'
			|| tokenizer->input[tokenizer->startPos] == '\r'
			|| tokenizer->input[tokenizer->startPos] == '\n'
			|| tokenizer->input[tokenizer->startPos] == '\0'
			|| tokenizer->input[tokenizer->startPos] == '=' || tokenizer->input[tokenizer->startPos] == ';' // UDMF TEXTMAP.
			|| tokenizer->inComment != 0))
	{
		boolean inLineBreak = DetectLineBreak(tokenizer, tokenizer->startPos);

		// Try to detect comment endings now
		if (tokenizer->inComment == 1 && inLineBreak)
			tokenizer->inComment = 0; // End of line for a single-line comment
		else if (tokenizer->inComment == 2
			&& tokenizer->startPos < tokenizer->inputLength - 1
			&& tokenizer->input[tokenizer->startPos] == '*'
			&& tokenizer->input[tokenizer->startPos + 1] == '/')
		{
			// End of multi-line comment
			tokenizer->inComment = 0;
			tokenizer->startPos++; // Skip the '*'; the increment below skips the '/'
		}

		tokenizer->startPos++;
		DetectComment(tokenizer, &tokenizer->startPos);
	}

	// If the end of the string is reached, no token is to be read
	if (tokenizer->startPos >= tokenizer->inputLength)
	{
		tokenizer->endPos = tokenizer->inputLength;
		return NULL;
	}
	// Else, if it's one of these three symbols, capture only this one character
	else if (tokenizer->input[tokenizer->startPos] == ','
		|| tokenizer->input[tokenizer->startPos] == '{'
		|| tokenizer->input[tokenizer->startPos] == '}')
	{
		tokenizer->endPos = tokenizer->startPos + 1;
		tokenizer->token[i][0] = tokenizer->input[tokenizer->startPos];
		tokenizer->token[i][1] = '\0';
		return tokenizer->token[i];
	}
	// Return entire string within quotes, except without the quotes.
	else if (tokenizer->input[tokenizer->startPos] == '"')
	{
		tokenizer->endPos = ++tokenizer->startPos;
		while (tokenizer->endPos < tokenizer->inputLength
			&& tokenizer->input[tokenizer->endPos] != '"')
		{
			DetectLineBreak(tokenizer, tokenizer->endPos);
			tokenizer->endPos++;
		}

		Tokenizer_ReadTokenString(tokenizer, i);

		// Step over the closing quote only if one was found. For an
		// unterminated string endPos is already at the terminator, and
		// advancing it would make the next read index out of bounds.
		if (tokenizer->endPos < tokenizer->inputLength)
			tokenizer->endPos++;

		return tokenizer->token[i];
	}

	// Now find the end of the token. This includes several additional characters that are okay to capture as one character, but not trailing at the end of another token.
	tokenizer->endPos = tokenizer->startPos + 1;
	while (tokenizer->endPos < tokenizer->inputLength
		&& tokenizer->input[tokenizer->endPos] != ' '
		&& tokenizer->input[tokenizer->endPos] != '\t'
		&& tokenizer->input[tokenizer->endPos] != '\r'
		&& tokenizer->input[tokenizer->endPos] != '\n'
		&& tokenizer->input[tokenizer->endPos] != ','
		&& tokenizer->input[tokenizer->endPos] != '{'
		&& tokenizer->input[tokenizer->endPos] != '}'
		&& tokenizer->input[tokenizer->endPos] != '=' && tokenizer->input[tokenizer->endPos] != ';' // UDMF TEXTMAP.
		&& tokenizer->inComment == 0)
	{
		tokenizer->endPos++;
		// Try to detect comment starts now; if it's in a comment, we don't want it in this token
		DetectComment(tokenizer, &tokenizer->endPos);
	}

	Tokenizer_ReadTokenString(tokenizer, i);
	return tokenizer->token[i];
}
/// Reports the tokenizer's current end position.
///
/// \param tokenizer tokenizer to query
/// \return the value of tokenizer->endPos, i.e. the index at which the next
///         read starts
UINT32 Tokenizer_GetEndPos(tokenizer_t *tokenizer)
{
	const UINT32 currentEnd = tokenizer->endPos;

	return currentEnd;
}
/// Overwrites the tokenizer's end position.
///
/// The next read copies endPos into startPos before scanning, so this moves
/// the point from which tokenizing resumes.
///
/// \param tokenizer tokenizer to modify
/// \param newPos    index to store in tokenizer->endPos
void Tokenizer_SetEndPos(tokenizer_t *tokenizer, UINT32 newPos)
{
	tokenizer->endPos = newPos;
}