Skip to content

Instantly share code, notes, and snippets.

@Irwin1985
Created September 28, 2020 15:36
Show Gist options
  • Save Irwin1985/f7a55ee48ca028d370996af20cad8a17 to your computer and use it in GitHub Desktop.
Save Irwin1985/f7a55ee48ca028d370996af20cad8a17 to your computer and use it in GitHub Desktop.
A simple yet powerful state-based tokenizer.
*~~~~ TOKENIZER BEGIN ~~~~*
&& ======================================================================== &&
&& Class Tokenizer
&& ======================================================================== &&
Define Class Tokenizer As Custom
	&& Helper objects exposing the token-type and tokenizer-state constants.
	&& Both are instantiated in Init.
	tokentype = .Null.
	tokenstate = .Null.

	&& ======================================================================== &&
	&& Function Init
	&& Creates the TokenizeState and TokenType helper objects used by every
	&& other method of this class.
	&& ======================================================================== &&
	Function Init
		This.tokenstate = Createobject("TokenizeState")
		This.tokentype = Createobject("TokenType")
	Endfunc

	&& ======================================================================== &&
	&& Function IsOp
	&& Returns .T. when Chr is exactly one of the operator characters
	&& + - * / < > = ! | or the ampersand.
	&& ======================================================================== &&
	Function IsOp(Chr As Character) As Boolean
		&& The length guard keeps "$" (substring test) equivalent to an exact
		&& single-character comparison.
		Return Len(Chr) == 1 And Chr $ "+-*/<>=!|&"
	Endfunc

	&& ======================================================================== &&
	&& Function FindOpType
	&& Maps an operator to its TokenType value. nextChar is the character that
	&& follows firstOperator. It only matters for <, >, = and !, where a
	&& following '=' selects the compound type (<=, >=, ==, !=).
	&& Returns TokenType.UNKNOWN for anything that is not an operator.
	&& ======================================================================== &&
	Function FindOpType(firstOperator As Character, nextChar As Character)
		Local luType
		luType = This.tokentype.UNKNOWN
		Do Case
		Case firstOperator == '+'
			luType = This.tokentype.Add
		Case firstOperator == '-'
			luType = This.tokentype.SUBTRACT
		Case firstOperator == '*'
			luType = This.tokentype.MULTIPLY
		Case firstOperator == '/'
			luType = This.tokentype.DIVIDE
		Case firstOperator == '<'
			luType = Iif(nextChar == '=', This.tokentype.LESSEQUAL, This.tokentype.LESS)
		Case firstOperator == '>'
			luType = Iif(nextChar == '=', This.tokentype.GREATEREQUAL, This.tokentype.GREATER)
		Case firstOperator == '='
			luType = Iif(nextChar == '=', This.tokentype.EQUAL, This.tokentype.ASSIGNMENT)
		Case firstOperator == '!'
			luType = Iif(nextChar == '=', This.tokentype.NOTEQUAL, This.tokentype.Not)
		Case firstOperator == '|'
			luType = This.tokentype.Or
		Case firstOperator == '&'
			luType = This.tokentype.And
		Endcase
		Return luType
	Endfunc

	&& ======================================================================== &&
	&& Function IsCompoundOp (internal helper)
	&& Returns .T. when firstOperator followed by nextChar forms one
	&& two-character operator: <=, >=, ==, != or a doubled | or ampersand.
	&& Any other pair must be tokenized as two separate operators.
	&& ======================================================================== &&
	Protected Function IsCompoundOp(firstOperator As Character, nextChar As Character) As Boolean
		Do Case
		Case nextChar == '='
			Return Len(firstOperator) == 1 And firstOperator $ "<>=!"
		Case nextChar == '|' Or nextChar == '&'
			Return firstOperator == nextChar
		Endcase
		Return .F.
	Endfunc

	&& ======================================================================== &&
	&& Function LiteralType (internal helper)
	&& Returns the type value to store in a literal token. tcName is "NUMBER"
	&& or "STRING". The TokenType constant is used when the TokenType object
	&& defines it. Otherwise the TokenizeState constant of the same name is
	&& returned, which is the value earlier versions stored in the token.
	&& NOTE(review): TokenType is defined outside this class - confirm that it
	&& exposes NUMBER and STRING so the fallback can eventually be removed.
	&& ======================================================================== &&
	Protected Function LiteralType(tcName As String)
		If Pemstatus(This.tokentype, tcName, 5)
			Return Getpem(This.tokentype, tcName)
		Endif
		Return Getpem(This.tokenstate, tcName)
	Endfunc

	&& ======================================================================== &&
	&& Function IsParen
	&& Returns .T. for the grouping and punctuation characters ( ) [ ] and ,
	&& ======================================================================== &&
	Function IsParen(Chr As Character)
		Return Len(Chr) == 1 And Chr $ "()[],"
	Endfunc

	&& ======================================================================== &&
	&& Function FindParenType
	&& Maps a grouping or punctuation character to its TokenType value.
	&& Returns TokenType.UNKNOWN for any other character.
	&& ======================================================================== &&
	Function FindParenType(Chr As Character)
		Local luType
		luType = This.tokentype.UNKNOWN
		Do Case
		Case Chr == '('
			luType = This.tokentype.LEFT_PAREN
		Case Chr == ')'
			luType = This.tokentype.RIGHT_PAREN
		Case Chr == '['
			luType = This.tokentype.LEFT_BRACKET
		Case Chr == ']'
			luType = This.tokentype.RIGHT_BRACKET
		Case Chr == ','
			luType = This.tokentype.COMMA
		Endcase
		Return luType
	Endfunc

	&& ======================================================================== &&
	&& Function Tokenize
	&& Splits Source into Token objects and returns them in a Collection, in
	&& source order. The scanner is a small state machine with the states
	&& DEFAULT, OPERATOR, NUMBER, KEYWORD, STRING and _COMMENT.
	&&   - Operators are one character, or two for <=, >=, ==, != and a
	&&     doubled | or ampersand.
	&&   - Numbers are runs of digits.
	&&   - Words start with a letter and continue with letters or digits. They
	&&     are classified by FindStatementType.
	&&   - Strings are delimited by double quotes. The quotes are not part of
	&&     the token text.
	&&   - A '#' starts a comment that runs to the end of the line.
	&& Characters that match none of the above (for example blanks) are skipped.
	&& A string literal with no closing quote is discarded.
	&& Returns an empty Collection when Source is not a character value.
	&& ======================================================================== &&
	Function Tokenize(Source As String) As Collection
		Local loTokens, lcText, lcFirstOp, lcChar, luState, lnPos, lnLength, llConsumed
		loTokens = Createobject("Collection")
		If Vartype(Source) # "C"
			Return loTokens
		Endif
		lcText = ""
		lcFirstOp = ""
		luState = This.tokenstate.DEFAULT
		lnLength = Len(Source)
		lnPos = 1
		&& The loop runs one step past the last character, feeding a line feed
		&& as an end-of-input sentinel. The sentinel is not an operator, digit
		&& or letter, so a token still pending at the end is flushed instead of
		&& being lost.
		Do While lnPos <= lnLength + 1
			lcChar = Iif(lnPos <= lnLength, Substr(Source, lnPos, 1), Chr(10))
			&& A state sets llConsumed to .F. when it only used lcChar as a
			&& terminator. The same character is then examined again in the
			&& DEFAULT state.
			llConsumed = .T.
			Do Case
			Case luState = This.tokenstate.DEFAULT
				Do Case
				Case This.IsOp(lcChar)
					&& May be a single or a compound operator. The next
					&& character decides.
					lcFirstOp = lcChar
					luState = This.tokenstate.OPERATOR
				Case Isdigit(lcChar)
					lcText = lcText + lcChar
					luState = This.tokenstate.NUMBER
				Case This.IsParen(lcChar)
					loTokens.Add(Createobject("Token", lcChar, This.FindParenType(lcChar)))
				Case Isalpha(lcChar)
					lcText = lcText + lcChar
					luState = This.tokenstate.KEYWORD
				Case lcChar == '"'
					luState = This.tokenstate.STRING
				Case lcChar == '#'
					luState = This.tokenstate._COMMENT
				Endcase
			Case luState = This.tokenstate.OPERATOR
				If This.IsCompoundOp(lcFirstOp, lcChar)
					&& Two-character operator: emit it as one token.
					loTokens.Add(Createobject("Token", lcFirstOp + lcChar, This.FindOpType(lcFirstOp, lcChar)))
				Else
					&& Single-character operator: emit it and re-scan lcChar,
					&& which may itself start a new operator (as in 1+-2).
					loTokens.Add(Createobject("Token", lcFirstOp, This.FindOpType(lcFirstOp, '0')))
					llConsumed = .F.
				Endif
				luState = This.tokenstate.DEFAULT
			Case luState = This.tokenstate.NUMBER
				If Isdigit(lcChar)
					lcText = lcText + lcChar
				Else
					loTokens.Add(Createobject("Token", lcText, This.LiteralType("NUMBER")))
					lcText = ""
					luState = This.tokenstate.DEFAULT
					llConsumed = .F.
				Endif
			Case luState = This.tokenstate.KEYWORD
				If Isalpha(lcChar) Or Isdigit(lcChar)
					lcText = lcText + lcChar
				Else
					loTokens.Add(Createobject("Token", lcText, This.FindStatementType(lcText)))
					lcText = ""
					luState = This.tokenstate.DEFAULT
					llConsumed = .F.
				Endif
			Case luState = This.tokenstate.STRING
				If lcChar == '"'
					loTokens.Add(Createobject("Token", lcText, This.LiteralType("STRING")))
					lcText = ""
					luState = This.tokenstate.DEFAULT
				Else
					lcText = lcText + lcChar
				Endif
			Case luState = This.tokenstate._COMMENT
				If lcChar == Chr(13) Or lcChar == Chr(10)
					luState = This.tokenstate.DEFAULT
				Endif
			Endcase
			If llConsumed
				lnPos = lnPos + 1
			Endif
		Enddo
		Return loTokens
	Endfunc

	&& ======================================================================== &&
	&& Function FindStatementType
	&& Classifies a word produced by Tokenize. Reserved words (compared
	&& case-sensitively, in lower case) map to their own TokenType value.
	&& Every other word is reported as TokenType.KEYWORD.
	&& ======================================================================== &&
	Function FindStatementType(str As String)
		Local luType
		Do Case
		Case str == "print"
			luType = This.TokenType.PRINT
		Case str == "println"
			luType = This.TokenType.PRINTLN
		Case str == "wait"
			luType = This.TokenType.WAIT
		Case str == "script"
			luType = This.TokenType.SCRIPT
		Case str == "end"
			luType = This.TokenType.END
		Case str == "while"
			luType = This.TokenType.WHILE
		Case str == "if"
			luType = This.TokenType.IF
		Case str == "else"
			luType = This.TokenType.ELSE
		Case str == "def"
			luType = This.TokenType.DEF
		Case str == "true"
			luType = This.TokenType.TRUE
		Case str == "false"
			luType = This.TokenType.FALSE
		Case str == "and"
			luType = This.TokenType.AND
		Case str == "or"
			luType = This.TokenType.OR
		Case str == "xor"
			luType = This.TokenType.XOR
		Case str == "not"
			luType = This.TokenType.NOT
		Case str == "for"
			luType = This.TokenType.FOR
		Case str == "to"
			luType = This.TokenType.TO
		Otherwise
			luType = This.TokenType.KEYWORD
		Endcase
		Return luType
	Endfunc
Enddefine
*~~~~ TOKENIZER END ~~~~*
@Irwin1985
Copy link
Author

This version is not complete; it's just a prototype tokenizer. The final version will also be state-machine based, and will use header constants instead of class-based attributes for the sake of performance.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment