Skip to content

Instantly share code, notes, and snippets.

@remy-j-a-moueza
Created March 10, 2018 18:24
Show Gist options
  • Save remy-j-a-moueza/8909819cbf972430bfbb16dff768b97d to your computer and use it in GitHub Desktop.
Save remy-j-a-moueza/8909819cbf972430bfbb16dff768b97d to your computer and use it in GitHub Desktop.
An old lexer for a subset of C++ made with D1 around 2007.
/**
 * Simple pair-like structure for managing tokens: a numeric type, the lexeme
 * text and the line number attached to it.
 */
struct Token
{
public :
    int type = -1 ;       // Token kind; a default-constructed Token keeps -1.
    char [] value = "" ;  // The lexeme text.
    int line ;            // Line number attached to the token.

    /**
     * To ease construction: allows writing Token ( type, value, line ).
     *
     * Params:
     *   type  = token kind.
     *   value = lexeme text.
     *   line  = line number, 0 when omitted.
     * Returns: a Token holding the three given fields.
     */
    static Token opCall ( int type, char [] value, int line = 0 )
    {
        Token result ;
        result.type = type ;
        result.value = value ;
        result.line = line ;
        return result ;
    }

    /**
     * Renders the token as "(type: 'value')".
     *
     * The fields are passed as "%s" arguments instead of bare variadic
     * arguments so that a '%' inside `value` is printed literally.
     * NOTE(review): assumes `str` is D1 Phobos std.string, whose format ()
     * interprets every char [] argument as a format string -- confirm.
     */
    char [] toString ()
    {
        return str.format ( "(%s: '%s')", type, value );
    }
}
/**
 * Lexer for a subset of C++.
 *
 * The constructor pads the input so that lexemes are whitespace separated and
 * stores one untyped Token per lexeme in `slices`. next () then classifies the
 * slices one at a time, gluing multi-word basic types ("unsigned long long"),
 * bracketed groups and brace-delimited bodies into single tokens.
 */
class Lexer
{
protected :
    // Tells about the level of "braced" token we're looking for.
    // While it equals "braced", a "{" swallows everything up to its matching
    // "}" into one BRACED token. class/struct/union reset it to "" so that
    // their opening brace is returned as a plain LBRACE instead.
    char [] state = "braced" ;

public :
    char [] text = "" ;              // The input text, padded by the constructor.
    Token [] slices ;                // The text split into whitespace separated lexemes.
    int iterator ;                   // Index of the next slice that next () will read.
    Class currentNamespace ;         // The class, struct or union wherein we add definitions.
    Token currentToken ;             // The current token, used for line info.
    Node [] root ;                   // The top node once the analysis is finished.
    Class [ char []] classes ;       // A minimal symbol table for classes.
    Typedef [ char []] typedefs ;    // Typedefs, keyed by a string.

    /**
     * Splits `content` into slices.
     *
     * Params:
     *   content = the source text to tokenize.
     *
     * Every slice is stored with type 0 and the 1-based number of the line it
     * was found on; the real type is assigned later by next ().
     */
    this ( char [] content )
    {
        this.text = content ;

        // Prepare the text: it should be well separated.
        // Each punctuation character below is mapped to itself surrounded by spaces.
        char [][ char []] padded ;

        foreach ( item ; "[](),;:*&{}=~" )
        {
            char [] lexem = "" ~ item ;
            padded [ lexem ] = " " ~ lexem ~ " " ;
        }

        // Put a space at word / non-word boundaries.
        // NOTE(review): no "g" attribute is passed; if `re` is D1 Phobos
        // std.regexp, sub () presumably replaces only the first match -- confirm.
        text = re.sub ( text, r"([\w_\d]+)([^\w\d_]+)", r"$1 $2" );
        text = re.sub ( text, r"([^\w_\d]+)([\w\d_]+)", r"$1 $2" );

        foreach ( key, value ; padded )
            text = str.replace ( text, key, value );

        auto lines = str.splitlines ( text );

        foreach ( index, line ; lines )
            foreach ( item ; str.split ( line ))
                this.slices ~= Token ( 0, item, index +1 );
    }

    /**
     * Returns the next classified token and advances `iterator`.
     *
     * Returns: the token, also stored in `currentToken`. Once every slice has
     * been consumed a default Token (type -1) is returned and `currentToken`
     * is left untouched. Slices that match no rule keep type 0.
     */
    Token next ()
    {
        Token result ;

        // Do not get out of slices bounds.
        if ( iterator >= slices.length )
            return result ;

        result = slices [ iterator ++ ];
        char [] chunk = result.value ;
        // debug cout << iterator << ", chunk: " << chunk ~ \n ;
        result.type = 0 ;

        // Identifier.
        // NOTE(review): inside the character class the '^' looks literal, so
        // digits match here as well and the NUMBER branch below appears to be
        // unreachable for purely numeric chunks -- confirm the intent.
        if ( auto search = re.search ( chunk, r"[\w_^\d][\w\d_]*" ))
        {
            switch ( search.match (0))
            {
                case "typedef" : result.type = tokens.TYPEDEF ; break ;
                case "const"   : result.type = tokens.CONST ;   break ;
                case "virtual" : result.type = tokens.VIRTUAL ; break ;
                case "throw"   : result.type = tokens.THROW ;   break ;

                case "class"  :
                case "struct" :
                case "union"  :
                {
                    result.type = tokens.CLASS ;
                    // The upcoming "{" must be returned as LBRACE, not swallowed.
                    this.state = "" ;
                    break ;
                }

                case "private"   : result.type = tokens.PRIVATE ;   break ;
                case "protected" : result.type = tokens.PROTECTED ; break ;
                case "public"    : result.type = tokens.PUBLIC ;    break ;

                case "enum" :
                {
                    result.type = tokens.ENUM ;
                    this.state = "braced" ;
                    break ;
                }

                case "template": result.type = tokens.TEMPLATE ; break ;
                case "typename": result.type = tokens.TYPENAME ; break ;
                case "inline"  : result.type = tokens.INLINE ;   break ;
                case "static"  : result.type = tokens.STATIC ;   break ;
                case "register": result.type = tokens.REGISTER ; break ;
                // NOTE(review): "volatile" is typed as REGISTER; this looks like a
                // copy-paste slip -- confirm whether a dedicated token type exists.
                case "volatile": result.type = tokens.REGISTER ; break ;

                case "unsigned":
                {
                    result.type = tokens.IDENTIFIER ;

                    // Only look ahead when a slice is actually left.
                    if ( iterator < slices.length )
                    {
                        Token ahead = slices [ iterator ];

                        switch ( ahead.value )
                        {
                            case "char" :
                            case "short":
                            case "int"  :
                                result.type = tokens.BASICTYPE ;
                                result.value ~= " " ~ ahead.value ;
                                ++ iterator ;
                                break ;

                            case "long" :
                                result.type = tokens.BASICTYPE ;
                                result.value ~= " " ~ ahead.value ;
                                ++ iterator ;

                                // unsigned long long: "unsigned long" may be the
                                // last two slices, so check the bounds again.
                                if ( iterator < slices.length
                                     && slices [ iterator ].value == "long" )
                                {
                                    result.value ~= " " ~ slices [ iterator ].value ;
                                    ++ iterator ;
                                }
                                break ;

                            default: {}
                        }
                    }
                    break ;
                }

                case "long":
                {
                    result.type = tokens.IDENTIFIER ;

                    if ( iterator < slices.length )
                    {
                        Token ahead = slices [ iterator ];

                        switch ( ahead.value )
                        {
                            case "int"    :
                            case "long"   :
                            case "double" :
                                result.type = tokens.BASICTYPE ;
                                result.value ~= " " ~ ahead.value ;
                                ++ iterator ;
                                break ;

                            default: {}
                        }
                    }
                    break ;
                }

                case "signed":
                {
                    result.type = tokens.IDENTIFIER ;

                    if ( iterator < slices.length )
                    {
                        Token ahead = slices [ iterator ];

                        switch ( ahead.value )
                        {
                            case "char"   :
                            case "double" :
                            case "int"    :
                            case "long"   :
                            case "short"  :
                                result.type = tokens.BASICTYPE ;
                                result.value ~= " " ~ ahead.value ;
                                ++ iterator ;
                                break ;

                            default: {}
                        }
                    }
                    break ;
                }

                case "friend" : result.type = tokens.FRIEND ; break ;
                case "extern" : result.type = tokens.EXTERN ; break ;
                case "0"      : result.type = tokens.ZERO ;   break ;
                default       : result.type = tokens.IDENTIFIER ;
            }
        }
        else if ( auto search = re.search ( chunk, r"\d+" ))
        {
            result.type = tokens.NUMBER ;
        }
        else
        {
            switch ( chunk )
            {
                case "*" : result.type = tokens.STAR ;      break ;
                case "&" : result.type = tokens.AMPERSAND ; break ;
                case "<" : result.type = tokens.LT ;        break ;
                case ">" : result.type = tokens.GT ;        break ;

                case "[" :
                {
                    result.type = tokens.BRACKETED ;

                    // Swallow every slice up to and including the closing "]".
                    // Each slice is appended exactly once, also when the input
                    // ends before a "]" is found.
                    while ( iterator < slices.length )
                    {
                        char [] following = slices [ iterator ++ ].value ;
                        result.value ~= " " ~ following ;

                        if ( following == "]" )
                            break ;
                    }
                } break ;

                case ";" :
                {
                    result.type = tokens.SEMICOLON ;
                    this.state = "braced" ;
                    break ;
                }

                case ":" : result.type = tokens.COLON ; break ;

                case "{" :
                {
                    result.type = tokens.LBRACE ;

                    if ( this.state == "braced" )
                    {
                        result.type = tokens.BRACED ;
                        uint count = 1 ; // Number of closing brace '}' we search for.
                        char [] following = "" ;

                        // Collect the whole body, nested braces included, until
                        // the matching "}" or the end of the slices.
                        while ( count && iterator < slices.length )
                        {
                            following = slices [ iterator ++ ].value ;

                            switch ( following )
                            {
                                case "{" : ++ count ; break ;
                                case "}" : -- count ; break ;
                                default  : {}
                            }
                            result.value ~= " " ~ following ;
                        }
                    }
                    this.state = "braced" ;
                } break ;

                case "}"   : result.type = tokens.RBRACE ;   break ;
                case "("   : result.type = tokens.LPAREN ;   break ;
                case ")"   : result.type = tokens.RPAREN ;   break ;
                case "~"   : result.type = tokens.TILDA ;    break ;
                case ","   : result.type = tokens.COMMA ;    break ;
                case "="   : result.type = tokens.ASSIGN ;   break ;
                case "0"   : result.type = tokens.ZERO ;     break ;
                case "..." : result.type = tokens.ELLIPSIS ; break ;
                default    : {}
            }
        }

        this.currentToken = result ;
        return result ;
    }

    // Smoke test: prints every token of a small sample and the collected values.
    unittest
    {
        auto lexer = new Lexer ( "extern ; unsigned char ; typedef ; const * { & { ** } } virtual [][ 42 ]" );
        // lexer.state = "braced" ;
        char [][] values ;
        Token token ;

        // next () hands back a default Token (type -1) once the slices are exhausted.
        while (( token = lexer.next ).type != -1 )
        {
            cout << token << \n ;
            values ~= token.value ;
        }
        writefln ( values );
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment