Skip to content

Instantly share code, notes, and snippets.

@erdesigns-eu
Last active November 10, 2023 13:49
Show Gist options
  • Save erdesigns-eu/5c9064a99d4f9b0b05cfb53a9fee2b11 to your computer and use it in GitHub Desktop.
Save erdesigns-eu/5c9064a99d4f9b0b05cfb53a9fee2b11 to your computer and use it in GitHub Desktop.
Pascal lexer implemented in TypeScript
/**
 * Category assigned to every token the lexer produces.
 *
 * `PROCEDURE`, `FUNCTION`, `CONSTANT` and `RECORD` are emitted for the
 * reserved words `procedure`, `function`, `const` and `record`; all other
 * reserved words are reported as `KEYWORD`.
 */
type TokenType =
  | 'KEYWORD'
  | 'IDENTIFIER'
  | 'NUMBER'
  | 'STRING'
  | 'OPERATOR'
  | 'DELIMITER'
  | 'EOF'
  | 'UNKNOWN'
  | 'PROCEDURE'
  | 'FUNCTION'
  | 'CONSTANT'
  | 'RECORD';

/**
 * A single lexical unit of the input.
 *
 * `position`, `line` and `column` describe where the token STARTS:
 * `position` is the 0-based offset into the source text, `line` is 1-based
 * and `column` is the 0-based offset within that line.
 * For `STRING` tokens `value` is the string content without the surrounding
 * quotes; for `EOF` it is the empty string.
 */
class Token {
  constructor(
    public type: TokenType,
    public value: string,
    public position: number,
    public line: number,
    public column: number,
  ) {}
}

/**
 * Hand-written lexer for a small subset of Pascal.
 *
 * Call {@link Lexer.getNextToken} repeatedly until it returns a token of
 * type `EOF`. Whitespace and `{ ... }` comments are skipped. An unexpected
 * character, an unterminated comment or an unterminated string literal
 * raises an `Error`.
 */
class Lexer {
  /** Word operators; matched case-insensitively. */
  private static readonly OPERATOR_WORDS: ReadonlySet<string> = new Set([
    'mod', 'div', 'and', 'or', 'not', 'xor',
  ]);

  /** Reserved words (lower-case) mapped to the token type they produce. */
  private static readonly RESERVED_WORDS: ReadonlyMap<string, TokenType> = new Map<string, TokenType>([
    ['program', 'KEYWORD'],
    ['var', 'KEYWORD'],
    ['begin', 'KEYWORD'],
    ['end', 'KEYWORD'],
    ['if', 'KEYWORD'],
    ['then', 'KEYWORD'],
    ['else', 'KEYWORD'],
    ['while', 'KEYWORD'],
    ['do', 'KEYWORD'],
    ['for', 'KEYWORD'],
    ['to', 'KEYWORD'],
    ['procedure', 'PROCEDURE'],
    ['function', 'FUNCTION'],
    ['const', 'CONSTANT'],
    ['record', 'RECORD'],
  ]);

  /**
   * Characters that can begin a symbolic operator. ':' is listed so that
   * ':=' is recognised as one token; a lone ':' is still reported as a
   * DELIMITER by `operator()`.
   */
  private static readonly OPERATOR_START_CHARS = '+-*/=<>:';

  /** Single-character delimiters (':' is handled by `operator()`). */
  private static readonly DELIMITER_CHARS = ';,.()[]';

  private static readonly WHITESPACE = /\s/;
  private static readonly IDENTIFIER_START = /[a-zA-Z_]/;
  private static readonly IDENTIFIER_PART = /[a-zA-Z0-9_]/;

  private pos = 0;
  private line = 1;
  private column = 0;
  private currentChar: string | null;

  /**
   * @param text Source text to tokenise. Scanning starts at its very first
   *             character.
   */
  constructor(private readonly text: string) {
    // Initialised here rather than in a field initialiser (which would read
    // `this.text` before it is assigned under ES2022 class-field semantics),
    // and without an extra advance() that would skip the first character.
    this.currentChar = this.peek(0);
  }

  /** True when `ch` is an ASCII decimal digit. */
  private static isDigit(ch: string | null): boolean {
    return ch !== null && ch >= '0' && ch <= '9';
  }

  /**
   * Returns the character `offset` positions ahead of the cursor without
   * consuming anything, or `null` when that position is outside the input.
   */
  private peek(offset = 1): string | null {
    const index = this.pos + offset;
    return index >= 0 && index < this.text.length ? this.text.charAt(index) : null;
  }

  /**
   * Consumes the current character and keeps `line`/`column` in sync.
   * Does nothing once the end of input has been reached, so `pos` never
   * runs past `text.length`.
   */
  private advance(): void {
    if (this.currentChar === null) {
      return;
    }
    if (this.currentChar === '\n') {
      this.line++;
      this.column = 0;
    } else {
      this.column++;
    }
    this.pos++;
    this.currentChar = this.peek(0);
  }

  /** Consumes and returns the current character ('' at end of input). */
  private take(): string {
    const ch = this.peek(0) ?? '';
    this.advance();
    return ch;
  }

  /** Consumes characters while `accept` holds and returns them joined. */
  private takeWhile(accept: (ch: string) => boolean): string {
    let taken = '';
    for (let ch = this.peek(0); ch !== null && accept(ch); ch = this.peek(0)) {
      taken += ch;
      this.advance();
    }
    return taken;
  }

  /**
   * Throws an `Error` describing the current cursor location.
   *
   * @param message Optional detail; defaults to reporting the current
   *                character as unexpected.
   * @throws Error always.
   */
  private error(message?: string): never {
    const detail = message ?? `Unexpected character '${this.currentChar}'`;
    throw new Error(`Lexer error at position ${this.pos} (Line: ${this.line}, Column: ${this.column}): ${detail}`);
  }

  /** Skips a run of whitespace characters. */
  private skipWhitespace(): void {
    this.takeWhile((ch) => Lexer.WHITESPACE.test(ch));
  }

  /**
   * Skips a `{ ... }` comment. Must be called with the cursor on the
   * opening brace.
   *
   * @throws Error when the input ends before the closing brace.
   */
  private skipComment(): void {
    const startLine = this.line;
    const startColumn = this.column;
    this.advance(); // opening '{'
    this.takeWhile((ch) => ch !== '}');
    if (this.peek(0) === null) {
      this.error(`Unterminated comment starting at line ${startLine}, column ${startColumn}`);
    }
    this.advance(); // closing '}'
  }

  /**
   * Scans an unsigned number: digits, an optional fraction and an optional
   * exponent. A '.' or 'e' is only consumed when digits actually follow, so
   * `1..5` yields `1`, `.`, `.`, `5` rather than a malformed `1.`.
   */
  private number(): Token {
    const startPos = this.pos;
    const startLine = this.line;
    const startColumn = this.column;
    const digits = (ch: string): boolean => Lexer.isDigit(ch);

    let lexeme = this.takeWhile(digits);

    if (this.peek(0) === '.' && Lexer.isDigit(this.peek(1))) {
      lexeme += this.take();
      lexeme += this.takeWhile(digits);
    }

    const marker = this.peek(0);
    if (marker === 'e' || marker === 'E') {
      const sign = this.peek(1);
      // Number of characters (marker plus optional sign) before the digits.
      const prefixLength = sign === '+' || sign === '-' ? 2 : 1;
      if (Lexer.isDigit(this.peek(prefixLength))) {
        for (let i = 0; i < prefixLength; i++) {
          lexeme += this.take();
        }
        lexeme += this.takeWhile(digits);
      }
    }

    return new Token('NUMBER', lexeme, startPos, startLine, startColumn);
  }

  /**
   * Scans an identifier (letters, digits and underscores, not starting with
   * a digit) and classifies it as a word operator, a reserved word or a
   * plain identifier. Matching is case-insensitive; the token value keeps
   * the original spelling.
   */
  private identifier(): Token {
    const startPos = this.pos;
    const startLine = this.line;
    const startColumn = this.column;

    const lexeme = this.takeWhile((ch) => Lexer.IDENTIFIER_PART.test(ch));
    const lowered = lexeme.toLowerCase();

    let type: TokenType;
    if (Lexer.OPERATOR_WORDS.has(lowered)) {
      type = 'OPERATOR';
    } else {
      type = Lexer.RESERVED_WORDS.get(lowered) ?? 'IDENTIFIER';
    }
    return new Token(type, lexeme, startPos, startLine, startColumn);
  }

  /**
   * Scans a single-quoted string literal. Two consecutive quotes inside the
   * literal stand for one quote character.
   *
   * @throws Error when the input ends before the closing quote.
   */
  private string(): Token {
    const startPos = this.pos;
    const startLine = this.line;
    const startColumn = this.column;

    this.advance(); // opening quote
    let value = '';
    for (;;) {
      const ch = this.peek(0);
      if (ch === null) {
        return this.error(`Unterminated string literal starting at line ${startLine}, column ${startColumn}`);
      }
      if (ch !== '\'') {
        value += ch;
        this.advance();
      } else if (this.peek(1) === '\'') {
        // Doubled quote: an escaped quote character.
        value += '\'';
        this.advance();
        this.advance();
      } else {
        this.advance(); // closing quote
        break;
      }
    }
    return new Token('STRING', value, startPos, startLine, startColumn);
  }

  /** True when `char` is a single character that can begin a symbolic operator. */
  private isOperator(char: string): boolean {
    return char.length === 1 && Lexer.OPERATOR_START_CHARS.includes(char);
  }

  /**
   * Scans a symbolic operator: `+ - * / = < > <= >= <> :=`.
   * A ':' that is not followed by '=' is returned as a DELIMITER.
   */
  private operator(): Token {
    const startPos = this.pos;
    const startLine = this.line;
    const startColumn = this.column;

    const first = this.take();
    const second = this.peek(0);
    let type: TokenType = 'OPERATOR';
    let value = first;

    switch (first) {
      case '+':
      case '-':
      case '*':
      case '/':
      case '=':
        break;
      case ':':
        if (second === '=') {
          value = ':=';
        } else {
          type = 'DELIMITER';
        }
        break;
      case '<':
        if (second === '=' || second === '>') {
          value = first + second;
        }
        break;
      case '>':
        if (second === '=') {
          value = '>=';
        }
        break;
      default:
        type = 'UNKNOWN';
    }

    if (value.length === 2) {
      this.advance(); // second character of a two-character operator
    }
    return new Token(type, value, startPos, startLine, startColumn);
  }

  /** Scans a single-character delimiter. */
  private delimiter(): Token {
    const startPos = this.pos;
    const startLine = this.line;
    const startColumn = this.column;
    const value = this.take();
    return new Token('DELIMITER', value, startPos, startLine, startColumn);
  }

  /**
   * Returns the next token, skipping whitespace and comments.
   *
   * @returns The next token; an `EOF` token once the input is exhausted
   *          (repeated calls keep returning `EOF`).
   * @throws Error on a character that cannot start any token, or on an
   *         unterminated comment or string literal.
   */
  public getNextToken(): Token {
    for (;;) {
      const ch = this.peek(0);
      if (ch === null) {
        return new Token('EOF', '', this.pos, this.line, this.column);
      }
      if (Lexer.WHITESPACE.test(ch)) {
        this.skipWhitespace();
        continue;
      }
      if (ch === '{') {
        this.skipComment();
        continue;
      }
      if (Lexer.isDigit(ch)) {
        return this.number();
      }
      if (Lexer.IDENTIFIER_START.test(ch)) {
        return this.identifier();
      }
      if (ch === '\'') {
        return this.string();
      }
      if (this.isOperator(ch)) {
        return this.operator();
      }
      if (Lexer.DELIMITER_CHARS.includes(ch)) {
        return this.delimiter();
      }
      return this.error();
    }
  }
}
// Example usage: tokenise a small Pascal program and log every token,
// up to and including the terminating EOF token. Any error thrown by the
// lexer is written to the error console.
const sourceCode =
` program Sample;
var number: integer;
begin
number := 10 mod 3;
if number = 1 then
writeln('Result is one')
else
writeln('Result is not one');
number := number div 2;
number := number xor 1;
end.
`;
const lexer = new Lexer(sourceCode);
let token: Token;
try {
  for (;;) {
    token = lexer.getNextToken();
    console.log(token);
    if (token.type === 'EOF') {
      break;
    }
  }
} catch (error) {
  console.error(error);
}
@erdesigns-eu
Copy link
Author

This is a very basic lexer for only a small subset of the Pascal language, but it can serve as a basis for a more advanced lexer.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment