jameskeane/parser.js

## parser.js
// This code will teach you how to write a simple recursive descent parser.
// Corners will be cut, but by the end you will have the basis of most
// programming languages out there: strings, numbers, identifiers, and
// infix arithmetic expressions with precendence.
// At under 100 lines, I promise it won't take long. Happy Hacking!
//   -- James Keane
//
// First the source code has to be tokenized, for the parser.
// Let's do five tokens, and ignore whitespace:
//   1. operators (+, -, *, etc.)
//   2. identifiers (variable_name, funcName, etc.)
//   3. numbers (1234, 3.1459, 0.0, etc)
//   4. strings ('hello', etc.)
//   5. control flow ({, }, ;, etc)
tokenize = (s) => (s === '') ? [] : (
  s.split(/\s*([-+*\/\%=\(\)]|[A-Za-z_][A-Za-z0-9_]*|[0-9]*\.?[0-9]+|\'[^\']*\'|[\;\}])\s*/g)
   .filter((s) => !s.match(/^\s*$/))
);
// A better tokenizer would keep track of line and column positions, stream
// symbols or objects with meaning so that we wouldn't need to repeat ourselves.
//
// Now let's repeat ourselves and start building the parser rules bottom up.
let t = [], i = 0;
// Starting simple, we'll match basic tokens that are direct AST values:
string = () => /^\'[^\']*\'$/.test(t[i]) ?
    { type: 'string', value: t[i].slice(1, t[i++].length - 1) } : paren();
number = () => /^[0-9]*(\.[0-9]+)?$/.test(t[i]) ?
    { type: 'number', value: parseInt(t[i++], 10) } : string();
identifier = () => /^[a-zA-Z_][a-zA-Z_0-9]*$/.test(t[i]) ?
    { type: 'identifier', ident: t[i++] } : number();
// Notice how each rule tries to match it's corresponding token regex, and if
// it fails it calls the 'up' rule? That's recursive descent; we're just writing
// it 'upside down' because it's easier that way.
// In the `string` rule, you might've noticed that it's calling into a missing
// symbol `paren`? Remember that, it's important; it will be defined later.
//
// The basic idea behind recursive descent is to parse the source top down, i.e.
// we have a `program` => `statement`[] => `ifstmt` | `funcdef` | (`expression` + ';')
// etc.
// Let continue defining our expressions, with some mathematical operators.
// First though let's make a small helper since they all work pretty much the
// same way. Inside the helper, there is a subtle trick that handles operator
// precendence: the lhs calls down, but the rhs calls it self.
// This is the kind of flexibility that you just don't get with a parser
// generator.
op = (sym, lhs) => () => {
  const l = lhs();
  return (t[i] !== sym) ? l :
      { type: 'operator', lhs: l, op: t[i++], rhs: op(sym, lhs)() };
};
// now for the operators, in (reverse) precedence order
divide = op('/', identifier);
modulo = op('%', divide)
multiply = op('*', modulo);
minus = op('-', multiply);
add = op('+', minus);
// Let's stop for a second and walk through how we just did that. Consider
// `a * 3 + b` and think about what will happen if we call `add`. Don't worry if
// it takes a while to think it through, this is the hardest part.
//
// The final piece is `paren`, it ties it up from the beginning! and allows for
// expressions to be wrapped by parentheses to control precedence
expression = add;
paren = () => {
  if (t[i] !== '(') return null;
  i++;
  const next = expression();
  if (t[i++] !== ')') throw Error(`Unclosed parentheses at position ${i}.`);
  return next;
};
// all that's left is the parser entrypoint
module.exports = parse = (source) => {
  i = 0, t = tokenize(source);
  return expression();
};
// proof is in the pudding
console.log( parse(`a * 3 + b`) );
	// This code will teach you how to write a simple recursive descent parser.
	// Corners will be cut, but by the end you will have the basis of most
	// programming languages out there: strings, numbers, identifiers, and
	// infix arithmetic expressions with precendence.
	// At under 100 lines, I promise it won't take long. Happy Hacking!
	// -- James Keane
	//
	// First the source code has to be tokenized, for the parser.
	// Let's do five tokens, and ignore whitespace:
	// 1. operators (+, -, *, etc.)
	// 2. identifiers (variable_name, funcName, etc.)
	// 3. numbers (1234, 3.1459, 0.0, etc)
	// 4. strings ('hello', etc.)
	// 5. control flow ({, }, ;, etc)
	tokenize = (s) => (s === '') ? [] : (
	s.split(/\s([-+\/\%=\(\)]\|[A-Za-z_][A-Za-z0-9_]\|[0-9]\.?[0-9]+\|\'[^\']\'\|[\;\}])\s/g)
	.filter((s) => !s.match(/^\s*$/))
	);
	// A better tokenizer would keep track of line and column positions, stream
	// symbols or objects with meaning so that we wouldn't need to repeat ourselves.
	//
	// Now let's repeat ourselves and start building the parser rules bottom up.
	let t = [], i = 0;
	// Starting simple, we'll match basic tokens that are direct AST values:
	string = () => /^\'[^\']*\'$/.test(t[i]) ?
	{ type: 'string', value: t[i].slice(1, t[i++].length - 1) } : paren();
	number = () => /^[0-9]*(\.[0-9]+)?$/.test(t[i]) ?
	{ type: 'number', value: parseInt(t[i++], 10) } : string();
	identifier = () => /^[a-zA-Z_][a-zA-Z_0-9]*$/.test(t[i]) ?
	{ type: 'identifier', ident: t[i++] } : number();
	// Notice how each rule tries to match it's corresponding token regex, and if
	// it fails it calls the 'up' rule? That's recursive descent; we're just writing
	// it 'upside down' because it's easier that way.
	// In the `string` rule, you might've noticed that it's calling into a missing
	// symbol `paren`? Remember that, it's important; it will be defined later.
	//
	// The basic idea behind recursive descent is to parse the source top down, i.e.
	// we have a `program` => `statement`[] => `ifstmt` \| `funcdef` \| (`expression` + ';')
	// etc.
	// Let continue defining our expressions, with some mathematical operators.
	// First though let's make a small helper since they all work pretty much the
	// same way. Inside the helper, there is a subtle trick that handles operator
	// precendence: the lhs calls down, but the rhs calls it self.
	// This is the kind of flexibility that you just don't get with a parser
	// generator.
	op = (sym, lhs) => () => {
	const l = lhs();
	return (t[i] !== sym) ? l :
	{ type: 'operator', lhs: l, op: t[i++], rhs: op(sym, lhs)() };
	};
	// now for the operators, in (reverse) precedence order
	divide = op('/', identifier);
	modulo = op('%', divide)
	multiply = op('*', modulo);
	minus = op('-', multiply);
	add = op('+', minus);
	// Let's stop for a second and walk through how we just did that. Consider
	// `a * 3 + b` and think about what will happen if we call `add`. Don't worry if
	// it takes a while to think it through, this is the hardest part.
	//
	// The final piece is `paren`, it ties it up from the beginning! and allows for
	// expressions to be wrapped by parentheses to control precedence
	expression = add;
	paren = () => {
	if (t[i] !== '(') return null;
	i++;
	const next = expression();
	if (t[i++] !== ')') throw Error(`Unclosed parentheses at position ${i}.`);
	return next;
	};
	// all that's left is the parser entrypoint
	module.exports = parse = (source) => {
	i = 0, t = tokenize(source);
	return expression();
	};
	// proof is in the pudding
	console.log( parse(`a * 3 + b`) );