Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edwinyzh/92bb24a806cec967f96498e80705eabb to your computer and use it in GitHub Desktop.
Save edwinyzh/92bb24a806cec967f96498e80705eabb to your computer and use it in GitHub Desktop.
Golang tokenizer on Freepascal
unit utokenizer;
{$mode objfpc}{$H+}
interface
uses
Classes, SysUtils;
const
{$WARNINGS OFF}
IdentifierFirstSymbols: set of char = ['A' .. 'Z', 'a' .. 'z'];
IdentifierSymbols: set of char = ['A' .. 'Z', 'a' .. 'z', '0' .. '9'];
Digits: set of char = ['0' .. '9'];
HexDigits: set of char = ['A' .. 'F', 'a' .. 'f', '0' .. '9'];
ExponentDot = '.';
ExponentE: set of char = ['E', 'e'];
Signs: set of char = ['+', '-'];
Constants: array [0..3] of string = ('true', 'false', 'iota', 'nil');
Keywords: array [0 .. 24] of string =
('break', 'default', 'func', 'interface', 'select',
'case', 'defer', 'go', 'map', 'struct',
'chan', 'else', 'goto', 'package', 'switch',
'const', 'fallthrough', 'if', 'range', 'type',
'continue', 'for', 'import', 'return', 'var');
Operators: array [0..46] of string =
('+', '&', '+=', '&=', '&&', '==', '!=', '(', ')', '-', '|', '-=', '|=', '||',
'<', '<=', '[', ']', '*', '^', '*=', '^=', '<-', '>', '>=', '{', '}', '/', '<<',
'/=', '<<=', '++', '=', ':=', ',', ';', '%', '>>', '%=', '>>=',
'--', '!', '...', '.', ':', '&^', '&^=');
PredeclaredTypes: array [0..19] of string =
('bool', 'byte', 'complex64', 'complex128', 'error', 'float32', 'float64',
'int', 'int8', 'int16', 'int32', 'int64', 'rune', 'string', 'uint', 'uint8',
'uint16', 'uint32', 'uint64', 'uintptr');
PredeclaredFunctions: array [0..14] of string =
('append', 'cap', 'close', 'complex', 'copy', 'delete', 'imag', 'len',
'make', 'new', 'panic', 'print', 'println', 'real', 'recover');
{$WARNINGS ON}
CharLineEnd = #10;
CommentsLine = '//';
CommentsOpen = '/*';
CommentsClose = '*/';
type
TStringQuote =
(
sqSingle,
sqDouble,
sqSingleAndDouble
);
TTokenKind = (tkILLEGAL, tkEOF, tkCOMMENT, tkIDENT,
tkINT, tkFLOAT, tkIMAG, tkCHAR, tkSTRING,
tkADD, tkSUB, tkMUL, tkQUO, tkREM, tkAND, tkOR, tkXOR, tkSHL, tkSHR,
tkAND_NOT, tkADD_ASSIGN, tkSUB_ASSIGN, tkMUL_ASSIGN, tkQUO_ASSIGN,
tkREM_ASSIGN, tkAND_ASSIGN, tkOR_ASSIGN, tkXOR_ASSIGN, tkSHL_ASSIGN,
tkSHR_ASSIGN, tkAND_NOT_ASSIGN, tkLAND, tkLOR, tkARROW, tkINC, tkDEC,
tkEQL, tkLSS, tkGTR, tkASSIGN, tkNOT, tkNEQ, tkLEQ, tkGEQ, tkDEFINE,
tkELLIPSIS, tkLPAREN, tkLBRACK, tkLBRACE, tkCOMMA, tkPERIOD, tkRPAREN,
tkRBRACK, tkRBRACE, tkSEMICOLON, tkCOLON,
tkBREAK, tkCASE, tkCHAN, tkCONST, tkCONTINUE, tkDEFAULT, tkDEFER,
tkELSE, tkFALLTHROUGH, tkFOR, tkFUNC, tkGO, tkGOTO, tkIF, tkIMPORT,
tkINTERFACE, tkMAP, tkPACKAGE, tkRANGE, tkRETURN, tkSELECT, tkSTRUCT,
tkSWITCH, tkTYPE, tkVAR, tkSTRIDENT,tkSKIPRESULT, tkUnknown);
TPrecedence = (LowestPrec = 0 // non-operators
, UnaryPrec = 6, HighestPrec = 7);
const
StringQuote = sqSingleAndDouble;
type
PToken = ^TToken;
TToken = record
Token: string;
TextPos: integer;
LexType: TTokenKind;
end;
TTokens = array of TToken;
//TPTokens = array of PToken;
{ TTokenizer }
TTokenizer = class(TObject)
private
CurrPos: integer;
CurrTokenPos: integer;
CurrChar: char;
CurrToken: string;
FTokensList: TTokens;
FTokensCount: integer;
FSource: string;
procedure GetNextChar;
procedure Add(Token: string; Pos: integer; LType: TTokenKind);
procedure GetNumber;
procedure GetIdentifier;
procedure GetOthers;
function isKeyword(const Ident: string): boolean;
function isType(const Ident: string): boolean;
function IdentToTokenKind(const Ident: string): TTokenKind;
function NextChar: char;
{ function GenerateAnsi(const Str: string): string;}
function GetToken(ind: integer): TToken;
procedure SetSource(src: string);
public
constructor Create;
destructor Destroy; override;
procedure Analyze;
function GetNextToken: PToken;
function NextToken: PToken;
function GetCurrentToken: PToken;
function GetPreviousToken(const Step: integer): PToken;
property Token[ind: integer]: TToken read GetToken;
property TokensCount: integer read FTokensCount;
property Source: string read FSource write SetSource;
end;
implementation
uses utils;
{ TTokenizer }
procedure TTokenizer.Add(Token: string; Pos: integer; LType: TTokenKind);
begin
if Token = '' then
Exit;
Inc(FTokensCount);
SetLength(FTokensList, FTokensCount);
FTokensList[FTokensCount - 1].Token := Token;
FTokensList[FTokensCount - 1].TextPos := Pos;
FTokensList[FTokensCount - 1].LexType := LType;
end;
procedure TTokenizer.Analyze;
var
tokenType: TTokenKind;
begin
SetLength(FTokensList, 0);
GetNextChar;
while CurrChar <> #0 do
begin
if CurrPos > Length(FSource) then
Break;
case CurrChar of
'A' .. 'Z', 'a' .. 'z':
begin
CurrToken := '';
GetIdentifier;
if isKeyword(CurrToken) then
tokenType := IdentToTokenKind(CurrToken)
else
if isType(LowerCase(CurrToken)) and (Length(CurrToken) > 2) then
tokenType := IdentToTokenKind(UpperCase(CurrToken))
else
tokenType := tkIdent;
if CurrToken <> '' then
Add(CurrToken, (CurrPos - 1) - Length(CurrToken), tokenType);
CurrToken := '';
end;
'0' .. '9':
begin
CurrToken := '';
GetNumber;
if CurrToken <> '' then
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tkInt);
CurrToken := '';
end;
#1 .. #20, ' ':
begin
GetNextChar;
end;
else
begin
CurrToken := '';
GetOthers;
if CurrToken <> '' then
begin
tokenType := IdentToTokenKind(CurrToken);
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tokenType);
end
else
GetNextChar;
CurrToken := '';
//GetNextChar;
end;
//GetNextChar();
end;
end;
end;
function TTokenizer.GetNextToken: PToken;
begin
Inc(CurrTokenPos);
if CurrTokenPos > FTokensCount - 1 then
result := nil
else
result := @FTokensList[CurrTokenPos];
end;
function TTokenizer.NextToken: PToken;
begin
if CurrTokenPos + 1 > FTokensCount - 1 then
Result := nil
else
Result := @FTokensList[CurrTokenPos+1];
end;
function TTokenizer.GetCurrentToken: PToken;
begin
if CurrTokenPos > (FTokensCount - 1) then
Exit;
Result := @FTokensList[CurrTokenPos];
end;
function TTokenizer.GetPreviousToken(const Step: integer): PToken;
begin
if (CurrTokenPos = 0) or ((CurrTokenPos - Step) < 0) then
Exit;
Result := @FTokensList[CurrTokenPos-step];
end;
constructor TTokenizer.Create;
begin
inherited;
FTokensList := nil;
FTokensCount := 0;
CurrPos := 0;
CurrTokenPos := 0;
CurrChar := #0;
CurrToken := '';
end;
destructor TTokenizer.Destroy;
begin
SetLength(FTokensList, 0);
inherited;
end;
procedure TTokenizer.GetIdentifier;
begin
CurrToken := CurrChar;
GetNextChar;
while CharInSet(CurrChar, IdentifierSymbols) do
begin
CurrToken := CurrToken + CurrChar;
GetNextChar;
end;
end;
function TTokenizer.GetToken(ind: integer): TToken;
begin
if ind > (FTokensCount - 1) then
Exit;
Result := FTokensList[ind];
end;
procedure TTokenizer.GetNextChar;
begin
Inc(CurrPos);
if CurrPos > Length(FSource) then
CurrChar := #0
else
CurrChar := FSource[CurrPos];
end;
procedure TTokenizer.GetNumber;
begin
CurrToken := CurrChar;
GetNextChar;
while CharInSet(CurrChar, Digits) do
begin
CurrToken := CurrToken + CurrChar;
GetNextChar;
end;
if CurrChar <> ExponentDot then
Exit;
if not CharInSet(NextChar, Digits) then
Exit;
CurrToken := CurrToken + ExponentDot;
GetNextChar;
while CharInSet(CurrChar, Digits) do
begin
CurrToken := CurrToken + CurrChar;
GetNextChar;
end;
if not CharInSet(CurrChar, ExponentE) then
Exit;
CurrToken := CurrToken + 'E';
GetNextChar;
if CharInSet(CurrChar, Signs) then
CurrToken := CurrToken + CurrChar;
GetNextChar;
while CharInSet(CurrChar, Digits) do
begin
CurrToken := CurrToken + CurrChar;
GetNextChar;
end;
end;
procedure TTokenizer.GetOthers;
var
i: integer;
begin
case CurrChar of
';', '(', ')', '[', ']', ',', '@', '{', '}','_':
begin
CurrToken := CurrChar;
GetNextChar;
Exit;
end;
'!':
begin
CurrToken := CurrChar;
if (NextChar = '=') then
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
end;
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
IdentToTokenKind(CurrToken));
CurrToken := '';
//GetNextChar;
end;
'.':
begin
CurrToken := CurrChar;
if (NextChar = '.') then
begin
while NextChar = '.' do
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
end;
end;
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
IdentToTokenKind(CurrToken));
CurrToken := '';
//GetNextChar;
end;
':':
begin
CurrToken := CurrChar;
if (NextChar = '=') then
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
end;
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
IdentToTokenKind(CurrToken));
CurrToken := '';
//GetNextChar;
end;
'=':
begin
CurrToken := CurrChar;
if (NextChar = '=') then
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
end;
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
IdentToTokenKind(CurrToken));
CurrToken := '';
//GetNextChar;
end;
'&':
begin
CurrToken := CurrChar;
if (NextChar = '=') or (NextChar = '^') or (NextChar = '&') then
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
if (NextChar = '=') then
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
end;
end;
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
IdentToTokenKind(CurrToken));
CurrToken := '';
//GetNextChar;
end;
'<':
begin
CurrToken := CurrChar;
if (NextChar = '<') or (NextChar = '-') then
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
if (NextChar = '=') then
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
end;
end;
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
IdentToTokenKind(CurrToken));
CurrToken := '';
//GetNextChar;
end;
'>':
begin
CurrToken := CurrChar;
if (NextChar = '>') then
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
if (NextChar = '=') then
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
end;
end;
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
IdentToTokenKind(CurrToken));
CurrToken := '';
//GetNextChar;
end;
'|':
begin
CurrToken := CurrChar;
if (NextChar = '=') or (NextChar = '|') then
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
end;
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
IdentToTokenKind(CurrToken));
CurrToken := '';
//GetNextChar;
end;
'^':
begin
CurrToken := CurrChar;
if (NextChar = '=') then
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
end;
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
IdentToTokenKind(CurrToken));
CurrToken := '';
//GetNextChar;
end;
'%':
begin
CurrToken := CurrChar;
if (NextChar = '=') then
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
end;
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
IdentToTokenKind(CurrToken));
CurrToken := '';
//GetNextChar;
end;
'+':
begin
CurrToken := CurrChar;
if (NextChar = '=') or (NextChar = '+') then
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
end;
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
IdentToTokenKind(CurrToken));
CurrToken := '';
//GetNextChar;
end;
'-':
begin
CurrToken := CurrChar;
if (NextChar = '=') or (NextChar = '-') then
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
end;
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
IdentToTokenKind(CurrToken));
CurrToken := '';
//GetNextChar;
end;
'*':
begin
CurrToken := CurrChar;
if (NextChar = '=') then
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
end;
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
IdentToTokenKind(CurrToken));
CurrToken := '';
//GetNextChar;
end;
'"':
begin
if (StringQuote = sqDouble) or (StringQuote = sqSingleAndDouble) then
begin
Add('"', CurrPos, tkStrIdent);
GetNextChar;
while CurrChar <> '"' do
begin
CurrToken := CurrToken + CurrChar;
GetNextChar;
end;
if CurrToken <> '' then
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tkString);
CurrToken := '"';
GetNextChar;
{ Add('"', CurrPos, 2);
GetNextChar; }
end;
Exit;
end;
'/':
begin
if NextChar = '*' then
begin
GetNextChar;
CurrToken := '/';
while CurrChar <> '/' do
begin
CurrToken := CurrToken + CurrChar;
GetNextChar;
end;
CurrToken := CurrToken + '/';
if CurrToken <> '' then
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tkComment);
CurrToken := '';
//GetNextChar;
end
else
if NextChar = '/' then
begin
CurrToken := CurrToken + CurrChar;
GetNextChar;
while CurrChar <> #10 do
begin
CurrToken := CurrToken + CurrChar;
GetNextChar;
end;
if CurrToken <> '' then
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tkComment);
CurrToken := '';
//GetNextChar;
end
else
if (NextChar = '=') then
begin
GetNextChar;
CurrToken := CurrToken + CurrChar;
if CurrToken <> '' then
Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tkComment);
CurrToken := '';
//GetNextChar;
end;
end
else
Exit;
end;
end;
function TTokenizer.isKeyword(const Ident: string): boolean;
var
i: integer;
begin
Result := False;
for i := low(Keywords) to high(Keywords) do
if eq(Keywords[i], Ident) then
begin
Result := True;
break;
end;
end;
function TTokenizer.isType(const Ident: string): boolean;
var
i: integer;
begin
Result := False;
for i := low(PredeclaredTypes) to high(PredeclaredTypes) do
if Pos(ident, predeclaredTypes[i]) > 0 then
begin
Result := True;
break;
end;
end;
function TTokenizer.IdentToTokenKind(const Ident: string): TTokenKind;
begin
Result := tkUnknown;
case ident of
'ILLEGAL': Result := tkILLEGAL;
'EOF': Result := tkEOF;
'COMMENT': Result := tkCOMMENT;
'IDENT': Result := tkIDENT;
'INT': Result := tkINT;
'FLOAT': Result := tkFLOAT;
'IMAG': Result := tkIMAG;
'CHAR': Result := tkCHAR;
'STRING': Result := tkSTRING;
'_': Result := tkSKIPRESULT;
'+': Result := tkADD;
'-': Result := tkSUB;
'*': Result := tkMUL;
'/': Result := tkQUO;
'%': Result := tkREM;
'&': Result := tkAND;
'|': Result := tkOR;
'^': Result := tkXOR;
'<<': Result := tkSHL;
'>>': Result := tkSHR;
'&^': Result := tkAND_NOT;
'+=': Result := tkADD_ASSIGN;
'-=': Result := tkSUB_ASSIGN;
'*=': Result := tkMUL_ASSIGN;
'/=': Result := tkQUO_ASSIGN;
'%=': Result := tkREM_ASSIGN;
'&=': Result := tkAND_ASSIGN;
'|=': Result := tkOR_ASSIGN;
'^=': Result := tkXOR_ASSIGN;
'<<=': Result := tkSHL_ASSIGN;
'>>=': Result := tkSHR_ASSIGN;
'&^=': Result := tkAND_NOT_ASSIGN;
'&&': Result := tkLAND;
'||': Result := tkLOR;
'<-': Result := tkARROW;
'++': Result := tkINC;
'--': Result := tkDEC;
'==': Result := tkEQL;
'<': Result := tkLSS;
'>': Result := tkGTR;
'=': Result := tkASSIGN;
'!': Result := tkNOT;
'!=': Result := tkNEQ;
'<=': Result := tkLEQ;
'>=': Result := tkGEQ;
':=': Result := tkDEFINE;
'...': Result := tkELLIPSIS;
'(': Result := tkLPAREN;
'[': Result := tkLBRACK;
'{': Result := tkLBRACE;
'.': Result := tkPERIOD;
',': Result := tkCOMMA;
')': Result := tkRPAREN;
']': Result := tkRBRACK;
'}': Result := tkRBRACE;
';': Result := tkSEMICOLON;
':': Result := tkCOLON;
'"': Result := tkSTRIDENT;
'break': Result := tkBREAK;
'case': Result := tkCASE;
'chan': Result := tkCHAN;
'const': Result := tkCONST;
'continue': Result := tkCONTINUE;
'default': Result := tkDEFAULT;
'defer': Result := tkDEFER;
'else': Result := tkELSE;
'fallthrough': Result := tkFALLTHROUGH;
'for': Result := tkFOR;
'func': Result := tkFUNC;
'go': Result := tkGO;
'goto': Result := tkGOTO;
'if': Result := tkIF;
'import': Result := tkIMPORT;
'interface': Result := tkINTERFACE;
'map': Result := tkMAP;
'package': Result := tkPACKAGE;
'range': Result := tkRANGE;
'return': Result := tkRETURN;
'select': Result := tkSELECT;
'struct': Result := tkSTRUCT;
'switch': Result := tkSWITCH;
'type': Result := tkTYPE;
'var': Result := tkVAR;
end;
end;
function TTokenizer.NextChar: char;
begin
if CurrPos + 1 > Length(FSource) then
Result := #0
else
Result := FSource[CurrPos + 1];
end;
procedure TTokenizer.SetSource(src: string);
var
I, j: integer;
tmp: string;
begin
try
tmp := src;
finally
Self.FSource := tmp;
tmp := '';
end;
end;
end.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment