Skip to content

Instantly share code, notes, and snippets.

@rcampbell
Created May 5, 2011 10:12
Show Gist options
  • Save rcampbell/956824 to your computer and use it in GitHub Desktop.
Save rcampbell/956824 to your computer and use it in GitHub Desktop.
Lexical Analyzer for Scheme R5RS in JavaScript
//
// Lexical Analyzer for Scheme R5RS
//
// TODO: numbers, better error reporting
//
// Depends on Functional Javascript: http://osteele.com/sources/javascript/functional/
//
// Activate the Functional library named in the dependency note above.
// NOTE(review): presumably this publishes the library's helpers as globals
// and extends String.prototype — confirm against the library's documentation.
Functional.install();
/**
 * Lexical analyzer for a subset of Scheme R5RS, installed as
 * String.prototype.tokenize.
 *
 * Calling `source.tokenize()` scans the string from left to right and returns
 * an object with two accessors:
 *   - lexemes(): array of token instances, in source order. Each instance
 *     inherits `type`, `pattern` and `parse` from its token definition and
 *     carries its own `value` (parsed value), `line` (1-based line on which
 *     the token starts), `begin` (inclusive offset) and `end` (exclusive
 *     offset).
 *   - symbols(): object keyed by identifier text; each entry is
 *     `{type: 'system'}` for syntactic keywords or `{type: 'user'}` otherwise.
 *
 * Whitespace and comments are consumed but produce no lexemes.
 *
 * @returns {{symbols: function(): Object, lexemes: function(): Array,
 *            addSymbol: function(string), addLexeme: function(Object)}}
 * @throws {Error} with `name === 'UnknownToken'` when no token pattern matches
 *         at the current position (this includes unterminated strings and
 *         strings using escapes other than \" and \\). The message reports the
 *         1-based line and 0-based column of the offending text.
 */
String.prototype.tokenize = (function() {
    var expressionKeywords = ['quote', 'lambda', 'if', 'set!', 'begin', 'cond', 'and', 'or', 'case', 'let', 'let*', 'letrec', 'do', 'delay', 'quasiquote'];
    var syntacticKeywords = ['else', '=>', 'define', 'unquote', 'unquote-splicing'].concat(expressionKeywords);

    // Requires at least one character (`+`): an empty identifier would yield a
    // zero-length lexeme and the scan loop would never advance.
    // NOTE(review): `\+-\.` inside the class is a range (U+002B..U+002E) and so
    // also admits ','; left unchanged to keep accepting the same inputs.
    var identifierPattern = /^((?:\+|-|\.\.\.)|[0-9A-Za-z!\$%&\*/:\<=\>\?\^_~\+-\.@]+)[ \n\(\)";]/;

    // Small local helpers used as `parse` callbacks.
    var identity = function(x) { return x; };
    var constant = function(value) { return function() { return value; }; };

    // Token definitions, tried in order; the first matching pattern wins.
    // Capture group 1 is the lexeme; a trailing delimiter, where present, is
    // only looked at, not consumed. Definitions without `parse` are skipped
    // (no lexeme is emitted for them).
    var tokens = [
        {pattern: /^( +)/, type: 'space'},
        {pattern: /^(\n+)/, type: 'newline'},
        {pattern: /^(;.*)/, type: 'comment'},
        {pattern: /^(\()/, type: 'lparen', parse: identity},
        {pattern: /^(\))/, type: 'rparen', parse: identity},
        // Explicit radix so a leading zero is never interpreted as octal.
        {pattern: /^([0-9]+)[ \n\(\)";]/, type: 'number', parse: function(x) { return parseInt(x, 10); }},
        {pattern: /^(#t)/, type: 'boolean', parse: constant(true)},
        {pattern: /^(#f)/, type: 'boolean', parse: constant(false)},
        {pattern: /^(#\\space)[ \n\(\)";]/, type: 'character', parse: constant(' ')},
        {pattern: /^(#\\newline)[ \n\(\)";]/, type: 'character', parse: constant('\n')},
        {pattern: /^(#\\.)[ \n\(\)";]/, type: 'character', parse: function(x) { return x.substring(2); }},
        // The value is the text between the quotes; escapes are not decoded.
        {pattern: /^("(?:[^"\\]|\\"|\\\\)*")/, type: 'string', parse: function(x) { return x.substring(1, x.length - 1); }},
        {pattern: identifierPattern, type: 'identifier', parse: identity, identify: true}
    ];

    // Returns up to MAX leading non-whitespace characters of `program`,
    // used to illustrate an error message.
    var getErrorSnippet = function(program) {
        var MAX = 10;
        var snippet = /\S+/.exec(program);
        return (snippet && snippet[0] && snippet[0].substring(0, MAX)) || '';
    };

    return function() {
        var program = this ? this + ' ' : false; // EOF delimiter for implicit termination

        // Accumulates the symbol table and the lexeme list for this run.
        var output = (function() {
            var symbols = {};
            var lexemes = [];
            return {
                addSymbol: function(lexeme) {
                    // Own-property check: identifiers such as `constructor` or
                    // `toString` must not be mistaken for existing entries
                    // because of properties inherited from Object.prototype.
                    if (!Object.prototype.hasOwnProperty.call(symbols, lexeme)) {
                        // defineProperty (rather than assignment) so that a
                        // key like `__proto__` becomes an ordinary own entry.
                        Object.defineProperty(symbols, lexeme, {
                            value: {
                                type: syntacticKeywords.indexOf(lexeme) !== -1 ? 'system' : 'user'
                            },
                            enumerable: true,
                            writable: true,
                            configurable: true
                        });
                    }
                },
                symbols: function() {
                    return symbols;
                },
                addLexeme: function(lexeme) {
                    lexemes.push(lexeme);
                },
                lexemes: function() {
                    return lexemes;
                }
            };
        }());

        var begin = 0, end = 0, line = 1, character = 0;
        var i, token, match, lexeme, message, failure, lastNewline;

        // Builds a token instance for the current `token`/`lexeme` pair.
        var make = function() {
            var instance = Object.create(token);
            instance.value = token.parse(lexeme);
            instance.line = line;
            instance.begin = begin;
            instance.end = end;
            return instance;
        };

        while (program) {
            // Find the first token definition matching at the current position;
            // `token` stays bound to the definition that produced `match`.
            match = null;
            for (i = 0; i < tokens.length && !match; i += 1) {
                token = tokens[i];
                match = token.pattern.exec(program);
            }

            if (!match) {
                failure = getErrorSnippet(program);
                message = 'Unknown token starting on line ' + line + ', character ' + character + (failure ? (': ' + failure) : '.');
                failure = new Error(message);
                failure.name = 'UnknownToken';
                throw failure;
            }

            lexeme = match[1];
            end = begin + lexeme.length;

            if (token.parse) {
                output.addLexeme(make());
            }
            if (token.identify) {
                output.addSymbol(lexeme);
            }

            // Advance the position. Newlines may occur in newline runs and
            // inside string literals; either way the column restarts after the
            // last one so that columns are 0-based on every line.
            lastNewline = lexeme.lastIndexOf('\n');
            if (lastNewline === -1) {
                character += lexeme.length;
            } else {
                line += lexeme.split('\n').length - 1;
                character = lexeme.length - lastNewline - 1;
            }
            begin = end;
            program = program.substring(lexeme.length);
        }

        // Debug trace of the produced lexemes (kept from the original behaviour).
        console.log(output.lexemes().map(function(x) { return x.begin + '-' + x.end + ':' + x.type + ':' + x.value; }));
        return output;
    };
}());
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment