Last active
November 10, 2023 13:49
-
-
Save erdesigns-eu/5c9064a99d4f9b0b05cfb53a9fee2b11 to your computer and use it in GitHub Desktop.
Pascal lexer implemented in TypeScript
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Every category of token the lexer can emit.
 *
 * `'EOF'` is returned once the input is exhausted; the remaining members
 * classify the text carried by a token.
 */
type TokenType =
  | 'KEYWORD'
  | 'IDENTIFIER'
  | 'NUMBER'
  | 'STRING'
  | 'OPERATOR'
  | 'DELIMITER'
  | 'EOF'
  | 'UNKNOWN'
  | 'PROCEDURE'
  | 'FUNCTION'
  | 'CONSTANT'
  | 'RECORD';
/**
 * A single lexical unit: its category, its text, and a location in the source.
 */
class Token {
  /**
   * @param type - Category of the token.
   * @param value - Text carried by the token.
   * @param position - Character offset in the source text reported by the lexer.
   * @param line - Line number reported by the lexer.
   * @param column - Column number reported by the lexer.
   */
  constructor(
    public type: TokenType,
    public value: string,
    public position: number,
    public line: number,
    public column: number,
  ) {}
}
/**
 * Hand-written lexer for a small subset of Pascal.
 *
 * Construct it with the source text and call {@link Lexer.getNextToken}
 * repeatedly until a token of type `'EOF'` is returned.
 *
 * Every token records the location of its FIRST character: `position` is the
 * 0-based character offset, `line` is 1-based and `column` is 0-based.
 */
class Lexer {
  // Patterns and lookup tables are hoisted so they are built once, not per token.
  private static readonly WHITESPACE = /\s/;
  private static readonly DIGIT = /\d/;
  private static readonly IDENTIFIER_START = /[a-zA-Z_]/;
  private static readonly IDENTIFIER_PART = /[a-zA-Z0-9_]/;

  /** Lower-cased reserved word -> token type; anything else is an IDENTIFIER. */
  private static readonly RESERVED_WORDS = new Map<string, Token['type']>([
    ['mod', 'OPERATOR'],
    ['div', 'OPERATOR'],
    ['and', 'OPERATOR'],
    ['or', 'OPERATOR'],
    ['not', 'OPERATOR'],
    ['xor', 'OPERATOR'],
    ['program', 'KEYWORD'],
    ['var', 'KEYWORD'],
    ['begin', 'KEYWORD'],
    ['end', 'KEYWORD'],
    ['if', 'KEYWORD'],
    ['then', 'KEYWORD'],
    ['else', 'KEYWORD'],
    ['while', 'KEYWORD'],
    ['do', 'KEYWORD'],
    ['for', 'KEYWORD'],
    ['to', 'KEYWORD'],
    ['procedure', 'PROCEDURE'],
    ['function', 'FUNCTION'],
    ['const', 'CONSTANT'],
    ['record', 'RECORD'],
  ]);

  /** Characters that can begin an operator (':' is included because of ':='). */
  private static readonly OPERATOR_STARTS = new Set(['+', '-', '*', '/', '=', '<', '>', ':']);
  private static readonly TWO_CHAR_OPERATORS = new Set([':=', '<=', '>=', '<>']);
  private static readonly DELIMITERS = new Set([';', ',', '.', '(', ')', '[', ']']);

  private pos = 0;
  private line = 1;
  private column = 0;
  private currentChar: string | null;

  /**
   * @param text - Pascal source text to tokenize.
   */
  constructor(private text: string) {
    // Start ON the first character; advancing here would silently drop it.
    this.currentChar = this.charAt(0);
  }

  /** Returns the character at `index`, or `null` when `index` is past the end. */
  private charAt(index: number): string | null {
    return index < this.text.length ? this.text.charAt(index) : null;
  }

  /** Looks `offset` characters ahead of the current one without consuming anything. */
  private peek(offset: number = 1): string | null {
    return this.charAt(this.pos + offset);
  }

  /** Moves to the next character, keeping `line` and `column` in sync. */
  private advance(): void {
    if (this.currentChar === '\n') {
      this.line++;
      this.column = 0;
    } else {
      this.column++;
    }
    this.pos++;
    this.currentChar = this.charAt(this.pos);
  }

  /** Returns the current character (or `''` at end of input) and advances past it. */
  private take(): string {
    const ch = this.currentChar ?? '';
    this.advance();
    return ch;
  }

  /** Consumes and returns the longest run of characters matching `pattern`. */
  private takeWhile(pattern: RegExp): string {
    let run = '';
    while (this.currentChar !== null && pattern.test(this.currentChar)) {
      run += this.take();
    }
    return run;
  }

  private isDigit(ch: string | null): boolean {
    return ch !== null && Lexer.DIGIT.test(ch);
  }

  /**
   * Snapshots the current location and returns a factory that builds tokens
   * anchored there, so every token reports where it starts.
   */
  private beginToken(): (type: Token['type'], value: string) => Token {
    const { pos, line, column } = this;
    return (type, value) => new Token(type, value, pos, line, column);
  }

  /**
   * Throws a lexer error describing the current location.
   *
   * @param message - Optional detail; defaults to reporting the current character as unexpected.
   * @throws Error always.
   */
  private error(message?: string): never {
    const detail = message ?? `Unexpected character '${this.currentChar}'`;
    throw new Error(`Lexer error at position ${this.pos} (Line: ${this.line}, Column: ${this.column}): ${detail}`);
  }

  private skipWhitespace(): void {
    this.takeWhile(Lexer.WHITESPACE);
  }

  /**
   * Skips the body of a `{ ... }` comment. The opening brace must already be consumed.
   *
   * @throws Error if the input ends before the closing brace.
   */
  private skipComment(): void {
    while (this.currentChar !== '}') {
      if (this.currentChar === null) {
        this.error('Unterminated comment');
      }
      this.advance();
    }
    // Skip the closing curly brace
    this.advance();
  }

  /**
   * Lexes an unsigned integer or real literal: digits, an optional fraction and
   * an optional exponent. A '.' or 'e' is consumed only when digits follow, so
   * `1..10` and `3.` leave the dot for the delimiter rule.
   */
  private number(): Token {
    const emit = this.beginToken();
    let result = this.takeWhile(Lexer.DIGIT);

    if (this.currentChar === '.' && this.isDigit(this.peek())) {
      result += this.take();
      result += this.takeWhile(Lexer.DIGIT);
    }

    if (this.currentChar === 'e' || this.currentChar === 'E') {
      const next = this.peek();
      const signed = next === '+' || next === '-';
      if (this.isDigit(signed ? this.peek(2) : next)) {
        result += this.take(); // 'e' / 'E'
        if (signed) {
          result += this.take(); // '+' / '-'
        }
        result += this.takeWhile(Lexer.DIGIT);
      }
    }

    return emit('NUMBER', result);
  }

  /**
   * Lexes an identifier or reserved word. Digits are allowed after the first
   * character; reserved words are matched case-insensitively.
   */
  private identifier(): Token {
    const emit = this.beginToken();
    const word = this.takeWhile(Lexer.IDENTIFIER_PART);
    // The greedy scan already guarantees a word boundary, so no follow-character check is needed.
    return emit(Lexer.RESERVED_WORDS.get(word.toLowerCase()) ?? 'IDENTIFIER', word);
  }

  /**
   * Lexes a single-quoted string literal. A doubled quote (`''`) inside the
   * literal stands for one quote character. The token value excludes the
   * surrounding quotes.
   *
   * @throws Error if the input ends before the closing quote.
   */
  private string(): Token {
    const emit = this.beginToken();
    let result = '';
    // Skip the opening quote
    this.advance();
    for (;;) {
      if (this.currentChar === null) {
        this.error('Unterminated string literal');
      }
      if (this.currentChar === '\'') {
        if (this.peek() !== '\'') {
          break;
        }
        // Doubled quote: drop the first one, the second is appended below.
        this.advance();
      }
      result += this.take();
    }
    // Skip the closing quote
    this.advance();
    return emit('STRING', result);
  }

  /** Reports whether `char` can begin an operator token (including ':' for ':='). */
  private isOperator(char: string): boolean {
    return Lexer.OPERATOR_STARTS.has(char);
  }

  /**
   * Lexes a one- or two-character operator. A ':' that is not followed by '='
   * is returned as a DELIMITER.
   */
  private operator(): Token {
    const emit = this.beginToken();
    const first = this.take();
    const pair = first + (this.currentChar ?? '');

    if (Lexer.TWO_CHAR_OPERATORS.has(pair)) {
      this.advance();
      return emit('OPERATOR', pair);
    }
    if (first === ':') {
      return emit('DELIMITER', first);
    }
    return emit(Lexer.OPERATOR_STARTS.has(first) ? 'OPERATOR' : 'UNKNOWN', first);
  }

  /** Lexes a single-character delimiter. */
  private delimiter(): Token {
    const emit = this.beginToken();
    return emit('DELIMITER', this.take());
  }

  /**
   * Returns the next token, skipping whitespace and `{ ... }` comments.
   *
   * @returns The next token, or a token of type `'EOF'` once the input is exhausted.
   * @throws Error on an unexpected character, an unterminated comment, or an
   *   unterminated string literal.
   */
  public getNextToken(): Token {
    while (this.currentChar !== null) {
      const ch = this.currentChar;

      if (Lexer.WHITESPACE.test(ch)) {
        this.skipWhitespace();
        continue;
      }
      if (ch === '{') {
        this.advance();
        this.skipComment();
        continue;
      }
      if (Lexer.DIGIT.test(ch)) {
        return this.number();
      }
      if (Lexer.IDENTIFIER_START.test(ch)) {
        return this.identifier();
      }
      if (ch === '\'') {
        return this.string();
      }
      if (this.isOperator(ch)) {
        return this.operator();
      }
      if (Lexer.DELIMITERS.has(ch)) {
        return this.delimiter();
      }
      this.error();
    }
    return new Token('EOF', '', this.pos, this.line, this.column);
  }
}
// Example usage: tokenize a small Pascal program and print every token,
// including the final EOF token.
const sourceCode =
` program Sample;
var number: integer;
begin
number := 10 mod 3;
if number = 1 then
writeln('Result is one')
else
writeln('Result is not one');
number := number div 2;
number := number xor 1;
end.
`;
const lexer = new Lexer(sourceCode);
let token: Token;
try {
  do {
    token = lexer.getNextToken();
    console.log(token);
  } while (token.type !== 'EOF');
} catch (error: unknown) {
  // The lexer throws an Error on malformed input; report it instead of crashing.
  console.error(error);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is a very basic lexer for only a small subset of the Pascal language, but it can serve as the basis for a more advanced lexer.