Forked from CynicRus/gist:8f5d213d16c9b9eb75ef13dbd9bfa273
Created
October 25, 2020 16:17
-
-
Save edwinyzh/92bb24a806cec967f96498e80705eabb to your computer and use it in GitHub Desktop.
Golang tokenizer on Freepascal
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
unit utokenizer; | |
{$mode objfpc}{$H+} | |
interface | |
uses | |
Classes, SysUtils; | |
const | |
{$WARNINGS OFF} | |
IdentifierFirstSymbols: set of char = ['A' .. 'Z', 'a' .. 'z']; | |
IdentifierSymbols: set of char = ['A' .. 'Z', 'a' .. 'z', '0' .. '9']; | |
Digits: set of char = ['0' .. '9']; | |
HexDigits: set of char = ['A' .. 'F', 'a' .. 'f', '0' .. '9']; | |
ExponentDot = '.'; | |
ExponentE: set of char = ['E', 'e']; | |
Signs: set of char = ['+', '-']; | |
Constants: array [0..3] of string = ('true', 'false', 'iota', 'nil'); | |
Keywords: array [0 .. 24] of string = | |
('break', 'default', 'func', 'interface', 'select', | |
'case', 'defer', 'go', 'map', 'struct', | |
'chan', 'else', 'goto', 'package', 'switch', | |
'const', 'fallthrough', 'if', 'range', 'type', | |
'continue', 'for', 'import', 'return', 'var'); | |
Operators: array [0..46] of string = | |
('+', '&', '+=', '&=', '&&', '==', '!=', '(', ')', '-', '|', '-=', '|=', '||', | |
'<', '<=', '[', ']', '*', '^', '*=', '^=', '<-', '>', '>=', '{', '}', '/', '<<', | |
'/=', '<<=', '++', '=', ':=', ',', ';', '%', '>>', '%=', '>>=', | |
'--', '!', '...', '.', ':', '&^', '&^='); | |
PredeclaredTypes: array [0..19] of string = | |
('bool', 'byte', 'complex64', 'complex128', 'error', 'float32', 'float64', | |
'int', 'int8', 'int16', 'int32', 'int64', 'rune', 'string', 'uint', 'uint8', | |
'uint16', 'uint32', 'uint64', 'uintptr'); | |
PredeclaredFunctions: array [0..14] of string = | |
('append', 'cap', 'close', 'complex', 'copy', 'delete', 'imag', 'len', | |
'make', 'new', 'panic', 'print', 'println', 'real', 'recover'); | |
{$WARNINGS ON} | |
CharLineEnd = #10; | |
CommentsLine = '//'; | |
CommentsOpen = '/*'; | |
CommentsClose = '*/'; | |
type | |
TStringQuote = | |
( | |
sqSingle, | |
sqDouble, | |
sqSingleAndDouble | |
); | |
TTokenKind = (tkILLEGAL, tkEOF, tkCOMMENT, tkIDENT, | |
tkINT, tkFLOAT, tkIMAG, tkCHAR, tkSTRING, | |
tkADD, tkSUB, tkMUL, tkQUO, tkREM, tkAND, tkOR, tkXOR, tkSHL, tkSHR, | |
tkAND_NOT, tkADD_ASSIGN, tkSUB_ASSIGN, tkMUL_ASSIGN, tkQUO_ASSIGN, | |
tkREM_ASSIGN, tkAND_ASSIGN, tkOR_ASSIGN, tkXOR_ASSIGN, tkSHL_ASSIGN, | |
tkSHR_ASSIGN, tkAND_NOT_ASSIGN, tkLAND, tkLOR, tkARROW, tkINC, tkDEC, | |
tkEQL, tkLSS, tkGTR, tkASSIGN, tkNOT, tkNEQ, tkLEQ, tkGEQ, tkDEFINE, | |
tkELLIPSIS, tkLPAREN, tkLBRACK, tkLBRACE, tkCOMMA, tkPERIOD, tkRPAREN, | |
tkRBRACK, tkRBRACE, tkSEMICOLON, tkCOLON, | |
tkBREAK, tkCASE, tkCHAN, tkCONST, tkCONTINUE, tkDEFAULT, tkDEFER, | |
tkELSE, tkFALLTHROUGH, tkFOR, tkFUNC, tkGO, tkGOTO, tkIF, tkIMPORT, | |
tkINTERFACE, tkMAP, tkPACKAGE, tkRANGE, tkRETURN, tkSELECT, tkSTRUCT, | |
tkSWITCH, tkTYPE, tkVAR, tkSTRIDENT,tkSKIPRESULT, tkUnknown); | |
TPrecedence = (LowestPrec = 0 // non-operators | |
, UnaryPrec = 6, HighestPrec = 7); | |
const | |
StringQuote = sqSingleAndDouble; | |
type | |
PToken = ^TToken; | |
TToken = record | |
Token: string; | |
TextPos: integer; | |
LexType: TTokenKind; | |
end; | |
TTokens = array of TToken; | |
//TPTokens = array of PToken; | |
{ TTokenizer } | |
TTokenizer = class(TObject) | |
private | |
CurrPos: integer; | |
CurrTokenPos: integer; | |
CurrChar: char; | |
CurrToken: string; | |
FTokensList: TTokens; | |
FTokensCount: integer; | |
FSource: string; | |
procedure GetNextChar; | |
procedure Add(Token: string; Pos: integer; LType: TTokenKind); | |
procedure GetNumber; | |
procedure GetIdentifier; | |
procedure GetOthers; | |
function isKeyword(const Ident: string): boolean; | |
function isType(const Ident: string): boolean; | |
function IdentToTokenKind(const Ident: string): TTokenKind; | |
function NextChar: char; | |
{ function GenerateAnsi(const Str: string): string;} | |
function GetToken(ind: integer): TToken; | |
procedure SetSource(src: string); | |
public | |
constructor Create; | |
destructor Destroy; override; | |
procedure Analyze; | |
function GetNextToken: PToken; | |
function NextToken: PToken; | |
function GetCurrentToken: PToken; | |
function GetPreviousToken(const Step: integer): PToken; | |
property Token[ind: integer]: TToken read GetToken; | |
property TokensCount: integer read FTokensCount; | |
property Source: string read FSource write SetSource; | |
end; | |
implementation | |
uses utils; | |
{ TTokenizer } | |
procedure TTokenizer.Add(Token: string; Pos: integer; LType: TTokenKind); | |
begin | |
if Token = '' then | |
Exit; | |
Inc(FTokensCount); | |
SetLength(FTokensList, FTokensCount); | |
FTokensList[FTokensCount - 1].Token := Token; | |
FTokensList[FTokensCount - 1].TextPos := Pos; | |
FTokensList[FTokensCount - 1].LexType := LType; | |
end; | |
procedure TTokenizer.Analyze; | |
var | |
tokenType: TTokenKind; | |
begin | |
SetLength(FTokensList, 0); | |
GetNextChar; | |
while CurrChar <> #0 do | |
begin | |
if CurrPos > Length(FSource) then | |
Break; | |
case CurrChar of | |
'A' .. 'Z', 'a' .. 'z': | |
begin | |
CurrToken := ''; | |
GetIdentifier; | |
if isKeyword(CurrToken) then | |
tokenType := IdentToTokenKind(CurrToken) | |
else | |
if isType(LowerCase(CurrToken)) and (Length(CurrToken) > 2) then | |
tokenType := IdentToTokenKind(UpperCase(CurrToken)) | |
else | |
tokenType := tkIdent; | |
if CurrToken <> '' then | |
Add(CurrToken, (CurrPos - 1) - Length(CurrToken), tokenType); | |
CurrToken := ''; | |
end; | |
'0' .. '9': | |
begin | |
CurrToken := ''; | |
GetNumber; | |
if CurrToken <> '' then | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tkInt); | |
CurrToken := ''; | |
end; | |
#1 .. #20, ' ': | |
begin | |
GetNextChar; | |
end; | |
else | |
begin | |
CurrToken := ''; | |
GetOthers; | |
if CurrToken <> '' then | |
begin | |
tokenType := IdentToTokenKind(CurrToken); | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tokenType); | |
end | |
else | |
GetNextChar; | |
CurrToken := ''; | |
//GetNextChar; | |
end; | |
//GetNextChar(); | |
end; | |
end; | |
end; | |
function TTokenizer.GetNextToken: PToken; | |
begin | |
Inc(CurrTokenPos); | |
if CurrTokenPos > FTokensCount - 1 then | |
result := nil | |
else | |
result := @FTokensList[CurrTokenPos]; | |
end; | |
function TTokenizer.NextToken: PToken; | |
begin | |
if CurrTokenPos + 1 > FTokensCount - 1 then | |
Result := nil | |
else | |
Result := @FTokensList[CurrTokenPos+1]; | |
end; | |
function TTokenizer.GetCurrentToken: PToken; | |
begin | |
if CurrTokenPos > (FTokensCount - 1) then | |
Exit; | |
Result := @FTokensList[CurrTokenPos]; | |
end; | |
function TTokenizer.GetPreviousToken(const Step: integer): PToken; | |
begin | |
if (CurrTokenPos = 0) or ((CurrTokenPos - Step) < 0) then | |
Exit; | |
Result := @FTokensList[CurrTokenPos-step]; | |
end; | |
constructor TTokenizer.Create; | |
begin | |
inherited; | |
FTokensList := nil; | |
FTokensCount := 0; | |
CurrPos := 0; | |
CurrTokenPos := 0; | |
CurrChar := #0; | |
CurrToken := ''; | |
end; | |
destructor TTokenizer.Destroy; | |
begin | |
SetLength(FTokensList, 0); | |
inherited; | |
end; | |
procedure TTokenizer.GetIdentifier; | |
begin | |
CurrToken := CurrChar; | |
GetNextChar; | |
while CharInSet(CurrChar, IdentifierSymbols) do | |
begin | |
CurrToken := CurrToken + CurrChar; | |
GetNextChar; | |
end; | |
end; | |
function TTokenizer.GetToken(ind: integer): TToken; | |
begin | |
if ind > (FTokensCount - 1) then | |
Exit; | |
Result := FTokensList[ind]; | |
end; | |
procedure TTokenizer.GetNextChar; | |
begin | |
Inc(CurrPos); | |
if CurrPos > Length(FSource) then | |
CurrChar := #0 | |
else | |
CurrChar := FSource[CurrPos]; | |
end; | |
procedure TTokenizer.GetNumber; | |
begin | |
CurrToken := CurrChar; | |
GetNextChar; | |
while CharInSet(CurrChar, Digits) do | |
begin | |
CurrToken := CurrToken + CurrChar; | |
GetNextChar; | |
end; | |
if CurrChar <> ExponentDot then | |
Exit; | |
if not CharInSet(NextChar, Digits) then | |
Exit; | |
CurrToken := CurrToken + ExponentDot; | |
GetNextChar; | |
while CharInSet(CurrChar, Digits) do | |
begin | |
CurrToken := CurrToken + CurrChar; | |
GetNextChar; | |
end; | |
if not CharInSet(CurrChar, ExponentE) then | |
Exit; | |
CurrToken := CurrToken + 'E'; | |
GetNextChar; | |
if CharInSet(CurrChar, Signs) then | |
CurrToken := CurrToken + CurrChar; | |
GetNextChar; | |
while CharInSet(CurrChar, Digits) do | |
begin | |
CurrToken := CurrToken + CurrChar; | |
GetNextChar; | |
end; | |
end; | |
procedure TTokenizer.GetOthers; | |
var | |
i: integer; | |
begin | |
case CurrChar of | |
';', '(', ')', '[', ']', ',', '@', '{', '}','_': | |
begin | |
CurrToken := CurrChar; | |
GetNextChar; | |
Exit; | |
end; | |
'!': | |
begin | |
CurrToken := CurrChar; | |
if (NextChar = '=') then | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
end; | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), | |
IdentToTokenKind(CurrToken)); | |
CurrToken := ''; | |
//GetNextChar; | |
end; | |
'.': | |
begin | |
CurrToken := CurrChar; | |
if (NextChar = '.') then | |
begin | |
while NextChar = '.' do | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
end; | |
end; | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), | |
IdentToTokenKind(CurrToken)); | |
CurrToken := ''; | |
//GetNextChar; | |
end; | |
':': | |
begin | |
CurrToken := CurrChar; | |
if (NextChar = '=') then | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
end; | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), | |
IdentToTokenKind(CurrToken)); | |
CurrToken := ''; | |
//GetNextChar; | |
end; | |
'=': | |
begin | |
CurrToken := CurrChar; | |
if (NextChar = '=') then | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
end; | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), | |
IdentToTokenKind(CurrToken)); | |
CurrToken := ''; | |
//GetNextChar; | |
end; | |
'&': | |
begin | |
CurrToken := CurrChar; | |
if (NextChar = '=') or (NextChar = '^') or (NextChar = '&') then | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
if (NextChar = '=') then | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
end; | |
end; | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), | |
IdentToTokenKind(CurrToken)); | |
CurrToken := ''; | |
//GetNextChar; | |
end; | |
'<': | |
begin | |
CurrToken := CurrChar; | |
if (NextChar = '<') or (NextChar = '-') then | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
if (NextChar = '=') then | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
end; | |
end; | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), | |
IdentToTokenKind(CurrToken)); | |
CurrToken := ''; | |
//GetNextChar; | |
end; | |
'>': | |
begin | |
CurrToken := CurrChar; | |
if (NextChar = '>') then | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
if (NextChar = '=') then | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
end; | |
end; | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), | |
IdentToTokenKind(CurrToken)); | |
CurrToken := ''; | |
//GetNextChar; | |
end; | |
'|': | |
begin | |
CurrToken := CurrChar; | |
if (NextChar = '=') or (NextChar = '|') then | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
end; | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), | |
IdentToTokenKind(CurrToken)); | |
CurrToken := ''; | |
//GetNextChar; | |
end; | |
'^': | |
begin | |
CurrToken := CurrChar; | |
if (NextChar = '=') then | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
end; | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), | |
IdentToTokenKind(CurrToken)); | |
CurrToken := ''; | |
//GetNextChar; | |
end; | |
'%': | |
begin | |
CurrToken := CurrChar; | |
if (NextChar = '=') then | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
end; | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), | |
IdentToTokenKind(CurrToken)); | |
CurrToken := ''; | |
//GetNextChar; | |
end; | |
'+': | |
begin | |
CurrToken := CurrChar; | |
if (NextChar = '=') or (NextChar = '+') then | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
end; | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), | |
IdentToTokenKind(CurrToken)); | |
CurrToken := ''; | |
//GetNextChar; | |
end; | |
'-': | |
begin | |
CurrToken := CurrChar; | |
if (NextChar = '=') or (NextChar = '-') then | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
end; | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), | |
IdentToTokenKind(CurrToken)); | |
CurrToken := ''; | |
//GetNextChar; | |
end; | |
'*': | |
begin | |
CurrToken := CurrChar; | |
if (NextChar = '=') then | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
end; | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), | |
IdentToTokenKind(CurrToken)); | |
CurrToken := ''; | |
//GetNextChar; | |
end; | |
'"': | |
begin | |
if (StringQuote = sqDouble) or (StringQuote = sqSingleAndDouble) then | |
begin | |
Add('"', CurrPos, tkStrIdent); | |
GetNextChar; | |
while CurrChar <> '"' do | |
begin | |
CurrToken := CurrToken + CurrChar; | |
GetNextChar; | |
end; | |
if CurrToken <> '' then | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tkString); | |
CurrToken := '"'; | |
GetNextChar; | |
{ Add('"', CurrPos, 2); | |
GetNextChar; } | |
end; | |
Exit; | |
end; | |
'/': | |
begin | |
if NextChar = '*' then | |
begin | |
GetNextChar; | |
CurrToken := '/'; | |
while CurrChar <> '/' do | |
begin | |
CurrToken := CurrToken + CurrChar; | |
GetNextChar; | |
end; | |
CurrToken := CurrToken + '/'; | |
if CurrToken <> '' then | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tkComment); | |
CurrToken := ''; | |
//GetNextChar; | |
end | |
else | |
if NextChar = '/' then | |
begin | |
CurrToken := CurrToken + CurrChar; | |
GetNextChar; | |
while CurrChar <> #10 do | |
begin | |
CurrToken := CurrToken + CurrChar; | |
GetNextChar; | |
end; | |
if CurrToken <> '' then | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tkComment); | |
CurrToken := ''; | |
//GetNextChar; | |
end | |
else | |
if (NextChar = '=') then | |
begin | |
GetNextChar; | |
CurrToken := CurrToken + CurrChar; | |
if CurrToken <> '' then | |
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tkComment); | |
CurrToken := ''; | |
//GetNextChar; | |
end; | |
end | |
else | |
Exit; | |
end; | |
end; | |
function TTokenizer.isKeyword(const Ident: string): boolean; | |
var | |
i: integer; | |
begin | |
Result := False; | |
for i := low(Keywords) to high(Keywords) do | |
if eq(Keywords[i], Ident) then | |
begin | |
Result := True; | |
break; | |
end; | |
end; | |
function TTokenizer.isType(const Ident: string): boolean; | |
var | |
i: integer; | |
begin | |
Result := False; | |
for i := low(PredeclaredTypes) to high(PredeclaredTypes) do | |
if Pos(ident, predeclaredTypes[i]) > 0 then | |
begin | |
Result := True; | |
break; | |
end; | |
end; | |
function TTokenizer.IdentToTokenKind(const Ident: string): TTokenKind; | |
begin | |
Result := tkUnknown; | |
case ident of | |
'ILLEGAL': Result := tkILLEGAL; | |
'EOF': Result := tkEOF; | |
'COMMENT': Result := tkCOMMENT; | |
'IDENT': Result := tkIDENT; | |
'INT': Result := tkINT; | |
'FLOAT': Result := tkFLOAT; | |
'IMAG': Result := tkIMAG; | |
'CHAR': Result := tkCHAR; | |
'STRING': Result := tkSTRING; | |
'_': Result := tkSKIPRESULT; | |
'+': Result := tkADD; | |
'-': Result := tkSUB; | |
'*': Result := tkMUL; | |
'/': Result := tkQUO; | |
'%': Result := tkREM; | |
'&': Result := tkAND; | |
'|': Result := tkOR; | |
'^': Result := tkXOR; | |
'<<': Result := tkSHL; | |
'>>': Result := tkSHR; | |
'&^': Result := tkAND_NOT; | |
'+=': Result := tkADD_ASSIGN; | |
'-=': Result := tkSUB_ASSIGN; | |
'*=': Result := tkMUL_ASSIGN; | |
'/=': Result := tkQUO_ASSIGN; | |
'%=': Result := tkREM_ASSIGN; | |
'&=': Result := tkAND_ASSIGN; | |
'|=': Result := tkOR_ASSIGN; | |
'^=': Result := tkXOR_ASSIGN; | |
'<<=': Result := tkSHL_ASSIGN; | |
'>>=': Result := tkSHR_ASSIGN; | |
'&^=': Result := tkAND_NOT_ASSIGN; | |
'&&': Result := tkLAND; | |
'||': Result := tkLOR; | |
'<-': Result := tkARROW; | |
'++': Result := tkINC; | |
'--': Result := tkDEC; | |
'==': Result := tkEQL; | |
'<': Result := tkLSS; | |
'>': Result := tkGTR; | |
'=': Result := tkASSIGN; | |
'!': Result := tkNOT; | |
'!=': Result := tkNEQ; | |
'<=': Result := tkLEQ; | |
'>=': Result := tkGEQ; | |
':=': Result := tkDEFINE; | |
'...': Result := tkELLIPSIS; | |
'(': Result := tkLPAREN; | |
'[': Result := tkLBRACK; | |
'{': Result := tkLBRACE; | |
'.': Result := tkPERIOD; | |
',': Result := tkCOMMA; | |
')': Result := tkRPAREN; | |
']': Result := tkRBRACK; | |
'}': Result := tkRBRACE; | |
';': Result := tkSEMICOLON; | |
':': Result := tkCOLON; | |
'"': Result := tkSTRIDENT; | |
'break': Result := tkBREAK; | |
'case': Result := tkCASE; | |
'chan': Result := tkCHAN; | |
'const': Result := tkCONST; | |
'continue': Result := tkCONTINUE; | |
'default': Result := tkDEFAULT; | |
'defer': Result := tkDEFER; | |
'else': Result := tkELSE; | |
'fallthrough': Result := tkFALLTHROUGH; | |
'for': Result := tkFOR; | |
'func': Result := tkFUNC; | |
'go': Result := tkGO; | |
'goto': Result := tkGOTO; | |
'if': Result := tkIF; | |
'import': Result := tkIMPORT; | |
'interface': Result := tkINTERFACE; | |
'map': Result := tkMAP; | |
'package': Result := tkPACKAGE; | |
'range': Result := tkRANGE; | |
'return': Result := tkRETURN; | |
'select': Result := tkSELECT; | |
'struct': Result := tkSTRUCT; | |
'switch': Result := tkSWITCH; | |
'type': Result := tkTYPE; | |
'var': Result := tkVAR; | |
end; | |
end; | |
function TTokenizer.NextChar: char; | |
begin | |
if CurrPos + 1 > Length(FSource) then | |
Result := #0 | |
else | |
Result := FSource[CurrPos + 1]; | |
end; | |
procedure TTokenizer.SetSource(src: string); | |
var | |
I, j: integer; | |
tmp: string; | |
begin | |
try | |
tmp := src; | |
finally | |
Self.FSource := tmp; | |
tmp := ''; | |
end; | |
end; | |
end. | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment