-
-
Save conartist6/71fd8e40f34a6a2267d4fd6990cb686c to your computer and use it in GitHub Desktop.
import parserate from '@iter-tools/parserate'; | |
// Factories for the two token shapes the tokenizer emits: structural
// punctuation ('token') and decoded values ('literal').
const t = {
  token(value) {
    return { type: 'token', value };
  },
  literal(value) {
    return { type: 'literal', value };
  },
};
// Maps the character that follows a backslash in a JSON string to the
// character it denotes.
const escapes = Object.fromEntries([
  ['"', '"'],
  ['\\', '\\'],
  ['b', '\b'],
  ['f', '\f'],
  ['n', '\n'],
  ['r', '\r'],
  ['t', '\t'],
]);

// The escape characters which have dedicated replacements above.
const escapeChars = Object.keys(escapes);
// Consumes a double-quoted JSON string from the character parserate and
// yields a single 'literal' token containing the decoded text.
// Handles \uXXXX escapes, the named escapes in `escapes`, and passes any
// other escaped character through verbatim (e.g. \/ -> /).
function* tokenizeString(parsr) {
  let literal = '';
  parsr.take('"');
  while (!parsr.done && !parsr.match('"')) {
    if (parsr.takeMatch('\\')) {
      if (parsr.takeMatch('u')) {
        // Fix: unicode escapes are four *hex* digits, not decimal digits;
        // the original /\d{4}/ rejected escapes such as \u00e9.
        const [charCode] = parsr.take(/[0-9a-fA-F]{4}/);
        literal += String.fromCharCode(parseInt(charCode, 16));
      } else if (parsr.takeMatch(escapeChars)) {
        literal += escapes[parsr.match[0]];
      } else if (parsr.takeMatch(/./)) {
        // Unrecognized escape: keep the escaped character as-is.
        literal += parsr.match[0];
      } else {
        parsr.error();
      }
    } else if (parsr.takeMatch(/./)) {
      literal += parsr.match[0];
    } else {
      parsr.error();
    }
  }
  parsr.take('"');
  yield t.literal(literal);
}
// Streams JSON tokens from `input` (an iterable of characters).
// Yields 'literal' tokens for values and 'token' tokens for punctuation;
// whitespace is skipped. Throws a syntax error on unexpected characters.
export function* tokenize(input) {
  const parsr = parserate(input);
  while (!parsr.done) {
    if (parsr.takeMatch('null')) {
      yield t.literal(null);
    } else if (parsr.takeMatch('true')) {
      // Fix: true/false are valid JSON literals but were not tokenized.
      yield t.literal(true);
    } else if (parsr.takeMatch('false')) {
      yield t.literal(false);
    } else if (parsr.match('"')) {
      yield* tokenizeString(parsr);
    } else if (parsr.takeMatch(/-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?/)) {
      // Fix: numbers are valid JSON literals but were not tokenized.
      yield t.literal(parseFloat(parsr.match[0]));
    } else if (parsr.takeMatch(['[', ']', '{', '}', ':', ','])) {
      yield t.token(parsr.match[0]);
    } else if (parsr.takeMatch(/\s+/)) {
      // Insignificant whitespace: consume and emit nothing.
    } else {
      throw parsr.error();
    }
  }
}
// Parses one JSON value (literal, object, or array) from a parserate
// wrapped around the token stream produced by tokenize().
// NOTE(review): if the stream is exhausted, or the current token is a
// 'token' whose value is neither '{' nor '[', this returns undefined --
// confirm that is the intended behavior.
function parseValue(parsr) {
  while(!parsr.done) {
    // Peek at the current token. NOTE(review): .value is read without an
    // explicit take; presumably parserate advances via the destructure/take
    // calls below -- verify against the parserate API.
    const token = parsr.value;
    switch(token.type) {
      case 'literal':
        // NOTE(review): the literal token itself is never explicitly taken
        // before returning -- confirm parserate's advance semantics.
        return token.value;
      case 'token':
        if (parsr.takeMatch('{')) {
          const obj = {};
          // Consume key/value pairs until '}' or a missing comma.
          while (!parsr.done && !parsr.match('}')) {
            // Destructuring the parserate consumes the next token (the key).
            const [key] = parsr;
            if (key.type !== 'literal') parsr.error(key);
            parsr.take(':');
            obj[key.value] = parseValue(parsr);
            // No comma means this was the last pair.
            if (!parsr.takeMatch(',')) break;
          }
          parsr.take('}');
          return obj;
        } else if (parsr.takeMatch('[')) {
          const arr = [];
          // Consume elements until ']' or a missing comma.
          while (!parsr.done && !parsr.match(']')) {
            arr.push(parseValue(parsr));
            if (!parsr.takeMatch(',')) break;
          }
          parsr.take(']');
          return arr;
        }
    }
  }
}
// Parses a complete JSON document from `input` (an iterable of characters)
// and returns the resulting value.
export function parse(input) {
  const tokens = parserate(tokenize(input));
  return parseValue(tokens);
}
// Incrementally parses a top-level JSON array, yielding each element as it
// is parsed rather than materializing the whole array in memory.
export function* parseStream(input) {
  const parsr = parserate(tokenize(input));
  parsr.take('[');
  for (;;) {
    if (parsr.done || parsr.match(']')) break;
    yield parseValue(parsr);
    // No comma after an element means it was the last one.
    if (!parsr.takeMatch(',')) break;
  }
  parsr.take(']');
}
The core idea is that parserate
functions almost like a coroutine for parsing. It tracks a lot of state for you as you work, for example everything you've tried to match()
since the last advance()
. parsr.error()
can then use the data to print a polite error like:
SyntaxError: Unexpected token at Line 2, Column 14
expectedTokens: [ '{', '[', /\d/, 'null' ]
Note: while I've done this all with sync generators for brevity, a more realistic use case for a streaming parser would be to work with async inputs. To handle async inputs you would simply use async generators, for await .. of
loops, and asyncParserate
.
If a sync and an async version of a parser is needed that would be done with code generation tooling. I am working on shipping macrome
, which has evolved from the tooling I created to solve just this problem in iter-tools
.
Some things I think are particularly compelling about writing a parser like this:
- Top-down control means highly useful stack traces. A smaller number of more powerful functions can be on the stack, where each function can represent a meaningful parsing context.
- Parser authors can trivially create syntax errors when conditions are unexpected, and errors will be thrown very close to the real problems -- either the conditions which the author chose to treat as errors or just programming errors like null pointers.
- It is easy to create a syntax which is dynamic, for example because it is possible to use configuration to determine some aspects of the language. Syntax errors will also adapt correctly.
- It is fairly trivial to implement a combination of a tokenizer and a parser, which can make for better error messages. In our JSON parser example it might allow a commonly reported error to be displayed as:
SyntaxError: Unexpected token: 'undefined'
instead of SyntaxError: Unexpected character: 'u'
. - It should be fairly easy to handle ambiguous syntaxes. Multiple forks can be made and passed to different functions each parsing the input experimentally. The priority of the ambiguous interpretations should also be clear when expressed this way.
- When the input is large (but can be streamed) the parser will avoid having to hold the entire input in memory (or swap) at any one time. The parser may be able to have even lower memory usage if it also emits its results in a streaming fashion.
Note: The original version of this code predates the implementation of
@iter-tools/parserate
and was used to consider the API that it should have