erdesigns-eu/lexer.ts

## lexer.ts
/** Every category of token the lexer can produce. */
type TokenType =
  | 'KEYWORD'
  | 'IDENTIFIER'
  | 'NUMBER'
  | 'STRING'
  | 'OPERATOR'
  | 'DELIMITER'
  | 'EOF'
  | 'UNKNOWN'
  | 'PROCEDURE'
  | 'FUNCTION'
  | 'CONSTANT'
  | 'RECORD';

/**
 * A single lexical token.
 *
 * `position` (0-based offset into the input), `line` (1-based) and `column`
 * (0-based, reset after every '\n') locate the FIRST character of the token.
 * For the EOF token they locate the end of the input.
 */
class Token {
  constructor(
    public type: TokenType,
    public value: string,
    public position: number,
    public line: number,
    public column: number,
  ) {}
}

/**
 * Hand-written lexer for a Pascal-style language.
 *
 * Call `getNextToken()` repeatedly; it returns one token per call and an
 * `EOF` token once the input is exhausted. Malformed input (an unexpected
 * character, an unterminated `{ ... }` comment or an unterminated string
 * literal) is reported by throwing an `Error`.
 */
class Lexer {
  // Character classes and lookup tables are built once instead of per call.
  private static readonly WHITESPACE = /\s/;
  private static readonly DIGIT = /\d/;
  private static readonly IDENTIFIER_START = /[a-zA-Z_]/;
  private static readonly IDENTIFIER_PART = /[a-zA-Z0-9_]/;
  private static readonly OPERATORS: ReadonlySet<string> = new Set([
    '+', '-', '*', '/', ':=', '=', '<', '>', '<=', '>=', '<>',
  ]);
  private static readonly TWO_CHAR_OPERATORS: ReadonlySet<string> = new Set([':=', '<=', '>=', '<>']);
  private static readonly DELIMITERS: ReadonlySet<string> = new Set([';', ':', ',', '.', '(', ')', '[', ']']);
  private static readonly WORD_OPERATORS: ReadonlySet<string> = new Set(['mod', 'div', 'and', 'or', 'not', 'xor']);
  private static readonly RESERVED_WORDS: ReadonlyMap<string, TokenType> = new Map<string, TokenType>([
    ['program', 'KEYWORD'],
    ['var', 'KEYWORD'],
    ['begin', 'KEYWORD'],
    ['end', 'KEYWORD'],
    ['if', 'KEYWORD'],
    ['then', 'KEYWORD'],
    ['else', 'KEYWORD'],
    ['while', 'KEYWORD'],
    ['do', 'KEYWORD'],
    ['for', 'KEYWORD'],
    ['to', 'KEYWORD'],
    ['procedure', 'PROCEDURE'],
    ['function', 'FUNCTION'],
    ['const', 'CONSTANT'],
    ['record', 'RECORD'],
  ]);

  private pos = 0;
  private line = 1;
  private column = 0;
  private currentChar: string | null;

  /**
   * @param text Complete source text to tokenize.
   */
  constructor(private text: string) {
    // Start ON the first character. Assigning here (rather than in a field
    // initializer) does not depend on parameter-property initialization order.
    this.currentChar = this.charAt(0);
  }

  /** Character at `index`, or null when `index` is past the end of the input. */
  private charAt(index: number): string | null {
    return index < this.text.length ? this.text.charAt(index) : null;
  }

  /** Character after the current one, without consuming anything. */
  private peek(): string | null {
    return this.charAt(this.pos + 1);
  }

  /** Consumes the current character and keeps line/column in sync. No-op at end of input. */
  private advance(): void {
    if (this.currentChar === null) {
      return;
    }

    if (this.currentChar === '\n') {
      this.line++;
      this.column = 0;
    } else {
      this.column++;
    }

    this.pos++;
    this.currentChar = this.charAt(this.pos);
  }

  /**
   * Throws a lexer error located at the current cursor.
   *
   * @param detail Problem description; defaults to naming the current character.
   * @throws Error always.
   */
  private error(detail?: string): never {
    const reason = detail ?? `Unexpected character '${this.currentChar}'`;
    throw new Error(`Lexer error at position ${this.pos} (Line: ${this.line}, Column: ${this.column}): ${reason}`);
  }

  /** Cursor snapshot, taken before a token is consumed so the token can report its start. */
  private mark(): { position: number; line: number; column: number } {
    return { position: this.pos, line: this.line, column: this.column };
  }

  /** Builds a token located at a previously taken `mark()`. */
  private tokenAt(
    start: { position: number; line: number; column: number },
    type: TokenType,
    value: string,
  ): Token {
    return new Token(type, value, start.position, start.line, start.column);
  }

  /** Consumes the longest run of characters matching `pattern` and returns it. */
  private takeWhile(pattern: RegExp): string {
    const from = this.pos;
    while (this.currentChar !== null && pattern.test(this.currentChar)) {
      this.advance();
    }
    return this.text.slice(from, this.pos);
  }

  private isDigit(char: string | null): boolean {
    return char !== null && Lexer.DIGIT.test(char);
  }

  /** Skips a run of whitespace characters. */
  private skipWhitespace(): void {
    this.takeWhile(Lexer.WHITESPACE);
  }

  /**
   * Skips the remainder of a `{ ... }` comment; the opening brace must already be consumed.
   *
   * @throws Error if the input ends before the closing brace.
   */
  private skipComment(): void {
    while (this.currentChar !== '}') {
      if (this.currentChar === null) {
        return this.error('Unterminated comment');
      }
      this.advance();
    }
    // Skip the closing curly brace
    this.advance();
  }

  /**
   * Scans an integer or real literal: digits, optional `.digits`, optional
   * exponent (`e`/`E`, optional sign, digits).
   *
   * A '.' or exponent marker is consumed only when a digit follows, so `1..5`
   * yields `1`, `.`, `.`, `5` and `10e` yields `10` followed by the identifier `e`.
   */
  private number(): Token {
    const start = this.mark();
    let result = this.takeWhile(Lexer.DIGIT);

    if (this.currentChar === '.' && this.isDigit(this.peek())) {
      this.advance();
      result += '.' + this.takeWhile(Lexer.DIGIT);
    }

    const marker = this.currentChar;
    if (marker !== null && marker.toLowerCase() === 'e') {
      const next = this.peek();
      const sign = next === '+' || next === '-' ? next : '';
      const firstDigit = sign === '' ? next : this.charAt(this.pos + 2);

      if (this.isDigit(firstDigit)) {
        this.advance(); // exponent marker
        if (sign !== '') {
          this.advance(); // sign
        }
        result += marker + sign + this.takeWhile(Lexer.DIGIT);
      }
    }

    return this.tokenAt(start, 'NUMBER', result);
  }

  /**
   * Scans a word (letter or '_' followed by letters, digits or '_') and
   * classifies it case-insensitively as a word operator, a reserved word or
   * an identifier. The token value keeps the original spelling.
   */
  private identifier(): Token {
    const start = this.mark();
    const word = this.takeWhile(Lexer.IDENTIFIER_PART);
    const lowered = word.toLowerCase();

    // The scan is maximal, so `word` already ends at a word boundary; no
    // look-ahead is needed to tell `mod` from `model`.
    if (Lexer.WORD_OPERATORS.has(lowered)) {
      return this.tokenAt(start, 'OPERATOR', word);
    }

    return this.tokenAt(start, Lexer.RESERVED_WORDS.get(lowered) ?? 'IDENTIFIER', word);
  }

  /**
   * Scans a single-quoted string literal. A doubled quote (`''`) inside the
   * literal stands for one quote character. The token value excludes the
   * surrounding quotes.
   *
   * @throws Error if the input ends before the closing quote.
   */
  private string(): Token {
    const start = this.mark();
    let result = '';
    // Skip the opening quote
    this.advance();

    for (;;) {
      const char = this.currentChar;
      if (char === null) {
        return this.error('Unterminated string literal');
      }

      if (char !== '\'') {
        result += char;
        this.advance();
        continue;
      }

      if (this.peek() === '\'') {
        // Escaped quote: keep one, consume both.
        result += '\'';
        this.advance();
        this.advance();
        continue;
      }

      // Skip the closing quote
      this.advance();
      return this.tokenAt(start, 'STRING', result);
    }
  }

  /** True when `char` is one of the symbolic operator lexemes. */
  private isOperator(char: string): boolean {
    return Lexer.OPERATORS.has(char);
  }

  /**
   * Scans a symbolic operator, preferring the two-character forms
   * (`:=`, `<=`, `>=`, `<>`). A lone ':' is returned as a DELIMITER and any
   * other non-operator character as UNKNOWN.
   */
  private operator(): Token {
    const start = this.mark();
    const first = this.currentChar ?? '';
    const pair = first + (this.peek() ?? '');

    if (Lexer.TWO_CHAR_OPERATORS.has(pair)) {
      this.advance();
      this.advance();
      return this.tokenAt(start, 'OPERATOR', pair);
    }

    this.advance();
    if (first === ':') {
      return this.tokenAt(start, 'DELIMITER', first);
    }
    return this.tokenAt(start, this.isOperator(first) ? 'OPERATOR' : 'UNKNOWN', first);
  }

  /** Consumes the current character and returns it as a DELIMITER token. */
  private delimiter(): Token {
    const start = this.mark();
    const value = this.currentChar ?? '';
    this.advance();
    return this.tokenAt(start, 'DELIMITER', value);
  }

  /**
   * Returns the next token, skipping whitespace and `{ ... }` comments.
   *
   * @returns The next token, or an `EOF` token at end of input.
   * @throws Error on an unexpected character, unterminated comment or unterminated string.
   */
  public getNextToken(): Token {
    for (;;) {
      const char = this.currentChar;

      if (char === null) {
        return this.tokenAt(this.mark(), 'EOF', '');
      }

      if (Lexer.WHITESPACE.test(char)) {
        this.skipWhitespace();
        continue;
      }

      if (char === '{') {
        this.advance();
        this.skipComment();
        continue;
      }

      if (Lexer.DIGIT.test(char)) {
        return this.number();
      }

      if (Lexer.IDENTIFIER_START.test(char)) {
        return this.identifier();
      }

      if (char === '\'') {
        return this.string();
      }

      // ':' alone is a delimiter, but ':=' must reach operator() as one lexeme.
      if (this.isOperator(char) || (char === ':' && this.peek() === '=')) {
        return this.operator();
      }

      if (Lexer.DELIMITERS.has(char)) {
        return this.delimiter();
      }

      return this.error();
    }
  }
}

// Demo driver: run the lexer over a small sample program and print each token.
const sourceCode =
` program Sample;
var number: integer;
begin
  number := 10 mod 3;
  if number = 1 then
    writeln('Result is one')
  else
    writeln('Result is not one');
  number := number div 2;
  number := number xor 1;
end.
`;

const lexer = new Lexer(sourceCode);
let token: Token;

try {
  // Pull tokens one at a time; the EOF token is printed too, then the loop ends.
  for (;;) {
    token = lexer.getNextToken();
    console.log(token);
    if (token.type === 'EOF') {
      break;
    }
  }
} catch (error) {
  // Anything thrown while lexing is reported on stderr instead of propagating.
  console.error(error);
}
	/** Every category of token the lexer can produce. */
	type TokenType =
	  | 'KEYWORD'
	  | 'IDENTIFIER'
	  | 'NUMBER'
	  | 'STRING'
	  | 'OPERATOR'
	  | 'DELIMITER'
	  | 'EOF'
	  | 'UNKNOWN'
	  | 'PROCEDURE'
	  | 'FUNCTION'
	  | 'CONSTANT'
	  | 'RECORD';

	/**
	 * A single lexical token.
	 *
	 * `position` (0-based offset into the input), `line` (1-based) and `column`
	 * (0-based, reset after every '\n') locate the FIRST character of the token.
	 * For the EOF token they locate the end of the input.
	 */
	class Token {
	  constructor(
	    public type: TokenType,
	    public value: string,
	    public position: number,
	    public line: number,
	    public column: number,
	  ) {}
	}

	/**
	 * Hand-written lexer for a Pascal-style language.
	 *
	 * Call `getNextToken()` repeatedly; it returns one token per call and an
	 * `EOF` token once the input is exhausted. Malformed input (an unexpected
	 * character, an unterminated `{ ... }` comment or an unterminated string
	 * literal) is reported by throwing an `Error`.
	 */
	class Lexer {
	  // Character classes and lookup tables are built once instead of per call.
	  private static readonly WHITESPACE = /\s/;
	  private static readonly DIGIT = /\d/;
	  private static readonly IDENTIFIER_START = /[a-zA-Z_]/;
	  private static readonly IDENTIFIER_PART = /[a-zA-Z0-9_]/;
	  private static readonly OPERATORS: ReadonlySet<string> = new Set([
	    '+', '-', '*', '/', ':=', '=', '<', '>', '<=', '>=', '<>',
	  ]);
	  private static readonly TWO_CHAR_OPERATORS: ReadonlySet<string> = new Set([':=', '<=', '>=', '<>']);
	  private static readonly DELIMITERS: ReadonlySet<string> = new Set([';', ':', ',', '.', '(', ')', '[', ']']);
	  private static readonly WORD_OPERATORS: ReadonlySet<string> = new Set(['mod', 'div', 'and', 'or', 'not', 'xor']);
	  private static readonly RESERVED_WORDS: ReadonlyMap<string, TokenType> = new Map<string, TokenType>([
	    ['program', 'KEYWORD'],
	    ['var', 'KEYWORD'],
	    ['begin', 'KEYWORD'],
	    ['end', 'KEYWORD'],
	    ['if', 'KEYWORD'],
	    ['then', 'KEYWORD'],
	    ['else', 'KEYWORD'],
	    ['while', 'KEYWORD'],
	    ['do', 'KEYWORD'],
	    ['for', 'KEYWORD'],
	    ['to', 'KEYWORD'],
	    ['procedure', 'PROCEDURE'],
	    ['function', 'FUNCTION'],
	    ['const', 'CONSTANT'],
	    ['record', 'RECORD'],
	  ]);

	  private pos = 0;
	  private line = 1;
	  private column = 0;
	  private currentChar: string | null;

	  /**
	   * @param text Complete source text to tokenize.
	   */
	  constructor(private text: string) {
	    // Start ON the first character. Assigning here (rather than in a field
	    // initializer) does not depend on parameter-property initialization order.
	    this.currentChar = this.charAt(0);
	  }

	  /** Character at `index`, or null when `index` is past the end of the input. */
	  private charAt(index: number): string | null {
	    return index < this.text.length ? this.text.charAt(index) : null;
	  }

	  /** Character after the current one, without consuming anything. */
	  private peek(): string | null {
	    return this.charAt(this.pos + 1);
	  }

	  /** Consumes the current character and keeps line/column in sync. No-op at end of input. */
	  private advance(): void {
	    if (this.currentChar === null) {
	      return;
	    }

	    if (this.currentChar === '\n') {
	      this.line++;
	      this.column = 0;
	    } else {
	      this.column++;
	    }

	    this.pos++;
	    this.currentChar = this.charAt(this.pos);
	  }

	  /**
	   * Throws a lexer error located at the current cursor.
	   *
	   * @param detail Problem description; defaults to naming the current character.
	   * @throws Error always.
	   */
	  private error(detail?: string): never {
	    const reason = detail ?? `Unexpected character '${this.currentChar}'`;
	    throw new Error(`Lexer error at position ${this.pos} (Line: ${this.line}, Column: ${this.column}): ${reason}`);
	  }

	  /** Cursor snapshot, taken before a token is consumed so the token can report its start. */
	  private mark(): { position: number; line: number; column: number } {
	    return { position: this.pos, line: this.line, column: this.column };
	  }

	  /** Builds a token located at a previously taken `mark()`. */
	  private tokenAt(
	    start: { position: number; line: number; column: number },
	    type: TokenType,
	    value: string,
	  ): Token {
	    return new Token(type, value, start.position, start.line, start.column);
	  }

	  /** Consumes the longest run of characters matching `pattern` and returns it. */
	  private takeWhile(pattern: RegExp): string {
	    const from = this.pos;
	    while (this.currentChar !== null && pattern.test(this.currentChar)) {
	      this.advance();
	    }
	    return this.text.slice(from, this.pos);
	  }

	  private isDigit(char: string | null): boolean {
	    return char !== null && Lexer.DIGIT.test(char);
	  }

	  /** Skips a run of whitespace characters. */
	  private skipWhitespace(): void {
	    this.takeWhile(Lexer.WHITESPACE);
	  }

	  /**
	   * Skips the remainder of a `{ ... }` comment; the opening brace must already be consumed.
	   *
	   * @throws Error if the input ends before the closing brace.
	   */
	  private skipComment(): void {
	    while (this.currentChar !== '}') {
	      if (this.currentChar === null) {
	        return this.error('Unterminated comment');
	      }
	      this.advance();
	    }
	    // Skip the closing curly brace
	    this.advance();
	  }

	  /**
	   * Scans an integer or real literal: digits, optional `.digits`, optional
	   * exponent (`e`/`E`, optional sign, digits).
	   *
	   * A '.' or exponent marker is consumed only when a digit follows, so `1..5`
	   * yields `1`, `.`, `.`, `5` and `10e` yields `10` followed by the identifier `e`.
	   */
	  private number(): Token {
	    const start = this.mark();
	    let result = this.takeWhile(Lexer.DIGIT);

	    if (this.currentChar === '.' && this.isDigit(this.peek())) {
	      this.advance();
	      result += '.' + this.takeWhile(Lexer.DIGIT);
	    }

	    const marker = this.currentChar;
	    if (marker !== null && marker.toLowerCase() === 'e') {
	      const next = this.peek();
	      const sign = next === '+' || next === '-' ? next : '';
	      const firstDigit = sign === '' ? next : this.charAt(this.pos + 2);

	      if (this.isDigit(firstDigit)) {
	        this.advance(); // exponent marker
	        if (sign !== '') {
	          this.advance(); // sign
	        }
	        result += marker + sign + this.takeWhile(Lexer.DIGIT);
	      }
	    }

	    return this.tokenAt(start, 'NUMBER', result);
	  }

	  /**
	   * Scans a word (letter or '_' followed by letters, digits or '_') and
	   * classifies it case-insensitively as a word operator, a reserved word or
	   * an identifier. The token value keeps the original spelling.
	   */
	  private identifier(): Token {
	    const start = this.mark();
	    const word = this.takeWhile(Lexer.IDENTIFIER_PART);
	    const lowered = word.toLowerCase();

	    // The scan is maximal, so `word` already ends at a word boundary; no
	    // look-ahead is needed to tell `mod` from `model`.
	    if (Lexer.WORD_OPERATORS.has(lowered)) {
	      return this.tokenAt(start, 'OPERATOR', word);
	    }

	    return this.tokenAt(start, Lexer.RESERVED_WORDS.get(lowered) ?? 'IDENTIFIER', word);
	  }

	  /**
	   * Scans a single-quoted string literal. A doubled quote (`''`) inside the
	   * literal stands for one quote character. The token value excludes the
	   * surrounding quotes.
	   *
	   * @throws Error if the input ends before the closing quote.
	   */
	  private string(): Token {
	    const start = this.mark();
	    let result = '';
	    // Skip the opening quote
	    this.advance();

	    for (;;) {
	      const char = this.currentChar;
	      if (char === null) {
	        return this.error('Unterminated string literal');
	      }

	      if (char !== '\'') {
	        result += char;
	        this.advance();
	        continue;
	      }

	      if (this.peek() === '\'') {
	        // Escaped quote: keep one, consume both.
	        result += '\'';
	        this.advance();
	        this.advance();
	        continue;
	      }

	      // Skip the closing quote
	      this.advance();
	      return this.tokenAt(start, 'STRING', result);
	    }
	  }

	  /** True when `char` is one of the symbolic operator lexemes. */
	  private isOperator(char: string): boolean {
	    return Lexer.OPERATORS.has(char);
	  }

	  /**
	   * Scans a symbolic operator, preferring the two-character forms
	   * (`:=`, `<=`, `>=`, `<>`). A lone ':' is returned as a DELIMITER and any
	   * other non-operator character as UNKNOWN.
	   */
	  private operator(): Token {
	    const start = this.mark();
	    const first = this.currentChar ?? '';
	    const pair = first + (this.peek() ?? '');

	    if (Lexer.TWO_CHAR_OPERATORS.has(pair)) {
	      this.advance();
	      this.advance();
	      return this.tokenAt(start, 'OPERATOR', pair);
	    }

	    this.advance();
	    if (first === ':') {
	      return this.tokenAt(start, 'DELIMITER', first);
	    }
	    return this.tokenAt(start, this.isOperator(first) ? 'OPERATOR' : 'UNKNOWN', first);
	  }

	  /** Consumes the current character and returns it as a DELIMITER token. */
	  private delimiter(): Token {
	    const start = this.mark();
	    const value = this.currentChar ?? '';
	    this.advance();
	    return this.tokenAt(start, 'DELIMITER', value);
	  }

	  /**
	   * Returns the next token, skipping whitespace and `{ ... }` comments.
	   *
	   * @returns The next token, or an `EOF` token at end of input.
	   * @throws Error on an unexpected character, unterminated comment or unterminated string.
	   */
	  public getNextToken(): Token {
	    for (;;) {
	      const char = this.currentChar;

	      if (char === null) {
	        return this.tokenAt(this.mark(), 'EOF', '');
	      }

	      if (Lexer.WHITESPACE.test(char)) {
	        this.skipWhitespace();
	        continue;
	      }

	      if (char === '{') {
	        this.advance();
	        this.skipComment();
	        continue;
	      }

	      if (Lexer.DIGIT.test(char)) {
	        return this.number();
	      }

	      if (Lexer.IDENTIFIER_START.test(char)) {
	        return this.identifier();
	      }

	      if (char === '\'') {
	        return this.string();
	      }

	      // ':' alone is a delimiter, but ':=' must reach operator() as one lexeme.
	      if (this.isOperator(char) || (char === ':' && this.peek() === '=')) {
	        return this.operator();
	      }

	      if (Lexer.DELIMITERS.has(char)) {
	        return this.delimiter();
	      }

	      return this.error();
	    }
	  }
	}

	// Demo driver: run the lexer over a small sample program and print each token.
	const sourceCode =
	` program Sample;
	var number: integer;
	begin
	number := 10 mod 3;
	if number = 1 then
	writeln('Result is one')
	else
	writeln('Result is not one');
	number := number div 2;
	number := number xor 1;
	end.
	`;

	const lexer = new Lexer(sourceCode);
	let token: Token;

	try {
	  // Pull tokens one at a time; the EOF token is printed too, then the loop ends.
	  for (;;) {
	    token = lexer.getNextToken();
	    console.log(token);
	    if (token.type === 'EOF') {
	      break;
	    }
	  }
	} catch (error) {
	  // Anything thrown while lexing is reported on stderr instead of propagating.
	  console.error(error);
	}