Skip to content

Instantly share code, notes, and snippets.

@jordanbtucker
Created July 21, 2021 20:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jordanbtucker/441fee538bd222cd6e9be0c061a3792e to your computer and use it in GitHub Desktop.
Save jordanbtucker/441fee538bd222cd6e9be0c061a3792e to your computer and use it in GitHub Desktop.
JSON5 parsing simple references as strings
// see https://github.com/json5/json5/issues/249
const util = require('./util')
// Module-level parser state. Every variable below is reset at the start of
// each parse() call and is shared by the lexer and the parser functions.

// The input being parsed, coerced to a string by parse().
let source
// Name of the current parser state; used as a key into parseStates and, via
// lexStates.default, into lexStates as well.
let parseState
// Objects and arrays that are currently open, innermost last.
let stack
// Index into source of the next character to be read.
let pos
// Line counter, starting at 1; read() increments it on '\n'.
let line
// Column counter; read() resets it to 0 on '\n' and advances it otherwise.
let column
// The token most recently returned by lex().
let token
// The most recently lexed property name, consumed by push().
let key
// The top-level parsed value; undefined until the first value is pushed.
let root
module.exports = function parse (text, reviver) {
source = String(text)
parseState = 'start'
stack = []
pos = 0
line = 1
column = 0
token = undefined
key = undefined
root = undefined
do {
token = lex()
// This code is unreachable.
// if (!parseStates[parseState]) {
// throw invalidParseState()
// }
parseStates[parseState]()
} while (token.type !== 'eof')
if (typeof reviver === 'function') {
return internalize({'': root}, '', reviver)
}
return root
}
/**
 * Recursively applies `reviver` to `holder[name]`, children first, in the
 * same manner as the reviver argument of JSON.parse.
 *
 * Only the value's own enumerable keys are visited (a snapshot taken before
 * any child is revived), so properties inherited through the prototype chain
 * are neither passed to the reviver nor copied onto the value. Replacements
 * are written with Object.defineProperty so that a key such as `__proto__`
 * always becomes an ordinary own data property instead of going through a
 * setter.
 *
 * @param {Object} holder - Object that owns the value being revived.
 * @param {string} name - Key of the value within `holder`.
 * @param {Function} reviver - Called as reviver.call(holder, name, value).
 * @returns {*} Whatever the reviver returns for `holder[name]`.
 */
function internalize (holder, name, reviver) {
    const value = holder[name]
    if (value != null && typeof value === 'object') {
        for (const key of Object.keys(value)) {
            const replacement = internalize(value, key, reviver)
            if (replacement === undefined) {
                // Returning undefined from the reviver removes the member.
                delete value[key]
            } else {
                Object.defineProperty(value, key, {
                    value: replacement,
                    writable: true,
                    enumerable: true,
                    configurable: true,
                })
            }
        }
    }
    return reviver.call(holder, name, value)
}
// Module-level lexer state. lex() resets lexState, buffer, doubleQuote and
// sign at the start of every token.

// Name of the current lexer state; used as a key into lexStates.
let lexState
// Text accumulated so far for the token being lexed.
let buffer
// True when the string being lexed was opened with a double quote.
let doubleQuote
// Multiplier applied to numeric tokens: 1, or -1 after a leading '-'.
let sign
// The character returned by peek() for the current lexer step (undefined at
// end of input).
let c
/**
 * Produces the next token from the input.
 *
 * Per-token lexer state is reset, then the handler for the current lexer
 * state is invoked repeatedly (each time with `c` refreshed from peek())
 * until one of them returns a token.
 *
 * @returns {{type: string, value: *, line: number, column: number}} The token.
 * @throws {SyntaxError} When a lexer state rejects the input.
 */
function lex () {
    lexState = 'default'
    buffer = ''
    doubleQuote = false
    sign = 1

    let produced
    do {
        c = peek()
        // Handlers return undefined while a token is still being assembled.
        produced = lexStates[lexState]()
    } while (!produced)
    return produced
}
/**
 * Returns the code point at the current position without consuming it.
 *
 * @returns {string|undefined} The full code point at `pos` as a string (two
 *   UTF-16 units for an astral character), or undefined at end of input.
 */
function peek () {
    const codePoint = source.codePointAt(pos)
    if (codePoint === undefined) {
        return undefined
    }
    return String.fromCodePoint(codePoint)
}
/**
 * Consumes and returns the code point at the current position, keeping the
 * line/column counters in step.
 *
 * At end of input nothing is consumed, but the column is still advanced by
 * one.
 *
 * @returns {string|undefined} The consumed code point, or undefined at end
 *   of input.
 */
function read () {
    const ch = peek()

    if (ch === undefined) {
        column++
        return ch
    }

    if (ch === '\n') {
        line++
        column = 0
    } else {
        // Columns are counted in UTF-16 units, matching `pos`.
        column += ch.length
    }
    pos += ch.length
    return ch
}
// Lexer state table. lex() repeatedly calls lexStates[lexState]() with the
// module-level `c` holding the next (unconsumed) character. A handler either
// returns a token, which ends the current lex() call, or returns undefined
// after consuming input and/or switching lexState, in which case lex() calls
// the (possibly new) state again. Handlers throw a SyntaxError on input they
// cannot accept.
//
// The table also contains one entry per parser state (start,
// beforePropertyName, ...); lexStates.default dispatches to the entry named by
// the current parseState so that lexing is context sensitive.
const lexStates = {
// Initial state for every token: skips whitespace, detects comments and end of
// input, then defers to the lexer state named after the current parse state.
default () {
switch (c) {
case '\t':
case '\v':
case '\f':
case ' ':
case '\u00A0':
case '\uFEFF':
case '\n':
case '\r':
case '\u2028':
case '\u2029':
read()
return
case '/':
read()
lexState = 'comment'
return
case undefined:
read()
return newToken('eof')
}
if (util.isSpaceSeparator(c)) {
read()
return
}
// This code is unreachable.
// if (!lexStates[parseState]) {
// throw invalidLexState(parseState)
// }
return lexStates[parseState]()
},
// After a '/': the next character decides between a multi-line comment ('*')
// and a single-line comment ('/'); anything else is an error.
comment () {
switch (c) {
case '*':
read()
lexState = 'multiLineComment'
return
case '/':
read()
lexState = 'singleLineComment'
return
}
throw invalidChar(read())
},
// Inside /* ... */: consumes characters until a '*' is seen; end of input
// inside the comment is an error.
multiLineComment () {
switch (c) {
case '*':
read()
lexState = 'multiLineCommentAsterisk'
return
case undefined:
throw invalidChar(read())
}
read()
},
// Inside /* ... */ directly after a '*': a '/' closes the comment, another '*'
// stays in this state, and any other character returns to multiLineComment.
multiLineCommentAsterisk () {
switch (c) {
case '*':
read()
return
case '/':
read()
lexState = 'default'
return
case undefined:
throw invalidChar(read())
}
read()
lexState = 'multiLineComment'
},
// Inside // ...: consumes characters up to and including a line terminator;
// end of input ends the comment and yields the eof token.
singleLineComment () {
switch (c) {
case '\n':
case '\r':
case '\u2028':
case '\u2029':
read()
lexState = 'default'
return
case undefined:
read()
return newToken('eof')
}
read()
},
// Start of a value: '{' and '[' are returned as punctuators; a sign, digit,
// '.', quote or identifier start selects the matching sub-state.
value () {
switch (c) {
case '{':
case '[':
return newToken('punctuator', read())
case '-':
case '+':
// Only '-' changes the sign; '+' leaves it at the default of 1.
if (read() === '-') {
sign = -1
}
lexState = 'sign'
return
case '.':
buffer = read()
lexState = 'decimalPointLeading'
return
case '0':
buffer = read()
lexState = 'zero'
return
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
buffer = read()
lexState = 'decimalInteger'
return
case '"':
case "'":
// Remember which quote opened the string so the other kind can appear inside.
doubleQuote = (read() === '"')
buffer = ''
lexState = 'string'
return
case '$':
case '_':
buffer = read()
lexState = 'identifierName'
return
case '\\':
read()
lexState = 'identifierNameStartEscape'
return
}
if (util.isIdStartChar(c)) {
buffer += read()
lexState = 'identifierName'
return
}
throw invalidChar(read())
},
// After a '\' at the start of an identifier (or identifier segment): requires
// a \uXXXX escape whose decoded character is '$', '_' or an identifier start
// character, and appends the decoded character to the buffer.
identifierNameStartEscape () {
if (c !== 'u') {
throw invalidChar(read())
}
read()
const u = unicodeEscape()
switch (u) {
case '$':
case '_':
break
default:
if (!util.isIdStartChar(u)) {
throw invalidIdentifier()
}
break
}
buffer += u
lexState = 'identifierName'
},
// Inside an identifier: accumulates continue characters and returns an
// 'identifier' token at the first character that cannot continue it (that
// character is left unconsumed).
identifierName () {
switch (c) {
case '$':
case '_':
case '\u200C':
case '\u200D':
buffer += read()
return
case '\\':
read()
lexState = 'identifierNameEscape'
return
case '.':
// When lexing a property value or an array element, a '.' is kept as part of
// the identifier, so a dotted reference such as a.b becomes one token. In any
// other parse state the '.' ends the identifier.
if (parseState === 'beforePropertyValue' || parseState === 'beforeArrayValue') {
lexState = 'identifierNameDot'
buffer += read()
return
}
return newToken('identifier', buffer)
}
if (util.isIdContinueChar(c)) {
buffer += read()
return
}
return newToken('identifier', buffer)
},
// Directly after a '.' inside a dotted reference: the next segment must begin
// with '$', '_', a \u escape or an identifier start character.
identifierNameDot () {
switch (c) {
case '$':
case '_':
buffer += read()
lexState = 'identifierName'
return
case '\\':
read()
lexState = 'identifierNameStartEscape'
return
}
if (util.isIdStartChar(c)) {
buffer += read()
lexState = 'identifierName'
return
}
throw invalidChar(read())
},
// After a '\' inside an identifier: requires a \uXXXX escape whose decoded
// character may continue an identifier, and appends the decoded character.
identifierNameEscape () {
if (c !== 'u') {
throw invalidChar(read())
}
read()
const u = unicodeEscape()
switch (u) {
case '$':
case '_':
case '\u200C':
case '\u200D':
break
default:
if (!util.isIdContinueChar(u)) {
throw invalidIdentifier()
}
break
}
buffer += u
lexState = 'identifierName'
},
// After a leading '+' or '-': expects a number, Infinity or NaN.
sign () {
switch (c) {
case '.':
buffer = read()
lexState = 'decimalPointLeading'
return
case '0':
buffer = read()
lexState = 'zero'
return
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
buffer = read()
lexState = 'decimalInteger'
return
case 'I':
read()
literal('nfinity')
return newToken('numeric', sign * Infinity)
case 'N':
read()
literal('aN')
// The sign is not applied here; the token value is NaN either way.
return newToken('numeric', NaN)
}
throw invalidChar(read())
},
// After a leading '0': may continue as a fraction, an exponent or a
// hexadecimal literal; otherwise the number is zero (with the sign applied).
zero () {
switch (c) {
case '.':
buffer += read()
lexState = 'decimalPoint'
return
case 'e':
case 'E':
buffer += read()
lexState = 'decimalExponent'
return
case 'x':
case 'X':
buffer += read()
lexState = 'hexadecimal'
return
}
return newToken('numeric', sign * 0)
},
// Inside the integer part of a decimal number that started with 1-9.
decimalInteger () {
switch (c) {
case '.':
buffer += read()
lexState = 'decimalPoint'
return
case 'e':
case 'E':
buffer += read()
lexState = 'decimalExponent'
return
}
if (util.isDigit(c)) {
buffer += read()
return
}
return newToken('numeric', sign * Number(buffer))
},
// After a '.' with no integer part: at least one digit must follow.
decimalPointLeading () {
if (util.isDigit(c)) {
buffer += read()
lexState = 'decimalFraction'
return
}
throw invalidChar(read())
},
// After a '.' that follows an integer part: digits and an exponent are
// optional, so a trailing decimal point ends the number.
decimalPoint () {
switch (c) {
case 'e':
case 'E':
buffer += read()
lexState = 'decimalExponent'
return
}
if (util.isDigit(c)) {
buffer += read()
lexState = 'decimalFraction'
return
}
return newToken('numeric', sign * Number(buffer))
},
// Inside the fractional digits of a decimal number.
decimalFraction () {
switch (c) {
case 'e':
case 'E':
buffer += read()
lexState = 'decimalExponent'
return
}
if (util.isDigit(c)) {
buffer += read()
return
}
return newToken('numeric', sign * Number(buffer))
},
// After 'e' or 'E': expects an optional sign followed by at least one digit.
decimalExponent () {
switch (c) {
case '+':
case '-':
buffer += read()
lexState = 'decimalExponentSign'
return
}
if (util.isDigit(c)) {
buffer += read()
lexState = 'decimalExponentInteger'
return
}
throw invalidChar(read())
},
// After the exponent's sign: a digit is required.
decimalExponentSign () {
if (util.isDigit(c)) {
buffer += read()
lexState = 'decimalExponentInteger'
return
}
throw invalidChar(read())
},
// Inside the exponent's digits; the first non-digit ends the number.
decimalExponentInteger () {
if (util.isDigit(c)) {
buffer += read()
return
}
return newToken('numeric', sign * Number(buffer))
},
// After '0x' or '0X': at least one hexadecimal digit is required.
hexadecimal () {
if (util.isHexDigit(c)) {
buffer += read()
lexState = 'hexadecimalInteger'
return
}
throw invalidChar(read())
},
// Inside hexadecimal digits. The buffer still begins with the '0x'/'0X'
// prefix, which is how Number() is given the literal.
hexadecimalInteger () {
if (util.isHexDigit(c)) {
buffer += read()
return
}
return newToken('numeric', sign * Number(buffer))
},
// Inside a quoted string: handles escapes, ends the token at the matching
// closing quote and rejects raw line feeds, carriage returns and end of input.
string () {
switch (c) {
case '\\':
read()
buffer += escape()
return
case '"':
if (doubleQuote) {
read()
return newToken('string', buffer)
}
// A double quote inside a single-quoted string is ordinary content.
buffer += read()
return
case "'":
if (!doubleQuote) {
read()
return newToken('string', buffer)
}
// A single quote inside a double-quoted string is ordinary content.
buffer += read()
return
case '\n':
case '\r':
throw invalidChar(read())
case '\u2028':
case '\u2029':
// A warning is logged, but the separator is still appended below.
separatorChar(c)
break
case undefined:
throw invalidChar(read())
}
buffer += read()
},
// Parse state 'start': '{' and '[' are returned directly; anything else is
// lexed as a value.
start () {
switch (c) {
case '{':
case '[':
return newToken('punctuator', read())
// This code is unreachable since the default lexState handles eof.
// case undefined:
// return newToken('eof')
}
lexState = 'value'
},
// Parse state 'beforePropertyName': expects an identifier or quoted string
// used as a key, or '}' closing the object.
beforePropertyName () {
switch (c) {
case '$':
case '_':
buffer = read()
lexState = 'identifierName'
return
case '\\':
read()
lexState = 'identifierNameStartEscape'
return
case '}':
return newToken('punctuator', read())
case '"':
case "'":
doubleQuote = (read() === '"')
lexState = 'string'
return
}
if (util.isIdStartChar(c)) {
buffer += read()
lexState = 'identifierName'
return
}
throw invalidChar(read())
},
// Parse state 'afterPropertyName': only ':' is accepted.
afterPropertyName () {
if (c === ':') {
return newToken('punctuator', read())
}
throw invalidChar(read())
},
// Parse state 'beforePropertyValue': any value may follow.
beforePropertyValue () {
lexState = 'value'
},
// Parse state 'afterPropertyValue': only ',' or '}' is accepted.
afterPropertyValue () {
switch (c) {
case ',':
case '}':
return newToken('punctuator', read())
}
throw invalidChar(read())
},
// Parse state 'beforeArrayValue': ']' closes the array; otherwise a value
// follows.
beforeArrayValue () {
if (c === ']') {
return newToken('punctuator', read())
}
lexState = 'value'
},
// Parse state 'afterArrayValue': only ',' or ']' is accepted.
afterArrayValue () {
switch (c) {
case ',':
case ']':
return newToken('punctuator', read())
}
throw invalidChar(read())
},
// Parse state 'end': the top-level value is complete, so any character that
// reaches this state is an error.
end () {
// This code is unreachable since it's handled by the default lexState.
// if (c === undefined) {
// read()
// return newToken('eof')
// }
throw invalidChar(read())
},
}
/**
 * Builds a token stamped with the lexer's current line and column.
 *
 * @param {string} type - Token type, e.g. 'punctuator', 'string', 'eof'.
 * @param {*} [value] - Token payload; left undefined when omitted.
 * @returns {{type: string, value: *, line: number, column: number}} The token.
 */
function newToken (type, value) {
    const created = {}
    created.type = type
    created.value = value
    created.line = line
    created.column = column
    return created
}
/**
 * Consumes the exact character sequence `s` from the input.
 *
 * @param {string} s - Characters that must appear next, in order.
 * @throws {SyntaxError} At the first character that does not match; the
 *   offending character is consumed before the error is built.
 */
function literal (s) {
    for (const expected of s) {
        if (peek() === expected) {
            read()
            continue
        }
        throw invalidChar(read())
    }
}
/**
 * Decodes the escape sequence that follows a backslash inside a string (the
 * backslash itself has already been consumed by the caller).
 *
 * @returns {string} The decoded character; '' for a line continuation; or
 *   the escaped character itself when it has no special meaning.
 * @throws {SyntaxError} On end of input, on the digits 1-9, on '\0' followed
 *   by another digit, or on a malformed \x / \u escape.
 */
function escape () {
    const ch = peek()

    // End of input and the digits 1-9 are rejected outright.
    if (ch === undefined || '123456789'.includes(ch)) {
        throw invalidChar(read())
    }

    // Every remaining branch consumes the escape character first.
    read()

    switch (ch) {
    case 'b': return '\b'
    case 'f': return '\f'
    case 'n': return '\n'
    case 'r': return '\r'
    case 't': return '\t'
    case 'v': return '\v'
    case '0':
        // "\0" is NUL only when it is not followed by another digit.
        if (util.isDigit(peek())) {
            throw invalidChar(read())
        }
        return '\0'
    case 'x': return hexEscape()
    case 'u': return unicodeEscape()
    case '\n':
    case '\u2028':
    case '\u2029':
        // Line continuation: the escaped terminator contributes nothing.
        return ''
    case '\r':
        // Treat an escaped CR LF pair as a single line continuation.
        if (peek() === '\n') {
            read()
        }
        return ''
    }

    // Any other character simply stands for itself.
    return ch
}
/**
 * Reads the two hexadecimal digits of a \xHH escape (the 'x' has already
 * been consumed) and returns the character they encode.
 *
 * @returns {string} The decoded character.
 * @throws {SyntaxError} If either of the next two characters is not a
 *   hexadecimal digit.
 */
function hexEscape () {
    let digits = ''
    for (let remaining = 2; remaining > 0; remaining--) {
        if (!util.isHexDigit(peek())) {
            throw invalidChar(read())
        }
        digits += read()
    }
    return String.fromCodePoint(parseInt(digits, 16))
}
/**
 * Reads the four hexadecimal digits of a \uXXXX escape (the 'u' has already
 * been consumed) and returns the character they encode.
 *
 * @returns {string} The decoded character.
 * @throws {SyntaxError} If any of the next four characters is not a
 *   hexadecimal digit.
 */
function unicodeEscape () {
    let digits = ''
    for (let i = 0; i < 4; i++) {
        const next = peek()
        if (util.isHexDigit(next)) {
            digits += read()
        } else {
            throw invalidChar(read())
        }
    }
    const codePoint = parseInt(digits, 16)
    return String.fromCodePoint(codePoint)
}
// Parser state table. After each lex(), parse() calls
// parseStates[parseState]() to consume the module-level `token`. Handlers
// either record the token (key / value), open or close a container through
// push() / pop(), or throw when the input ends too early. The lexer states of
// the same names already reject every other unexpected token, so the handlers
// only need to check for 'eof'.
const parseStates = {
    // Expecting the top-level value.
    start () {
        if (token.type === 'eof') {
            throw invalidEOF()
        }
        convertIdentifierToValue()
        push()
    },

    // Inside an object, expecting a key or the closing '}'.
    beforePropertyName () {
        const type = token.type
        if (type === 'eof') {
            throw invalidEOF()
        }
        if (type === 'identifier' || type === 'string') {
            key = token.value
            parseState = 'afterPropertyName'
        } else if (type === 'punctuator') {
            // The lexer only emits '}' as a punctuator in this state.
            pop()
        }
    },

    // A key was read; the lexer guarantees the token is ':' unless input ended.
    afterPropertyName () {
        if (token.type === 'eof') {
            throw invalidEOF()
        }
        parseState = 'beforePropertyValue'
    },

    // Expecting the value that belongs to the current key.
    beforePropertyValue () {
        if (token.type === 'eof') {
            throw invalidEOF()
        }
        convertIdentifierToValue()
        push()
    },

    // Inside an array, expecting an element or the closing ']'.
    beforeArrayValue () {
        if (token.type === 'eof') {
            throw invalidEOF()
        }
        const closesArray = token.type === 'punctuator' && token.value === ']'
        if (closesArray) {
            pop()
            return
        }
        convertIdentifierToValue()
        push()
    },

    // A property value was read; ',' continues the object and '}' closes it.
    afterPropertyValue () {
        if (token.type === 'eof') {
            throw invalidEOF()
        }
        if (token.value === ',') {
            parseState = 'beforePropertyName'
        } else if (token.value === '}') {
            pop()
        }
    },

    // An array element was read; ',' continues the array and ']' closes it.
    afterArrayValue () {
        if (token.type === 'eof') {
            throw invalidEOF()
        }
        if (token.value === ',') {
            parseState = 'beforeArrayValue'
        } else if (token.value === ']') {
            pop()
        }
    },

    // The top-level value is complete. The lexer's 'end' state throws on any
    // further character, so only the eof token arrives and nothing is left to do.
    end () {
    },
}
/**
 * Rewrites the current token in place when it is an identifier appearing in
 * value position.
 *
 * The keywords null, true, false, Infinity and NaN become tokens of type
 * 'null', 'boolean' or 'numeric' carrying the corresponding value. Any other
 * identifier (including a dotted reference such as a.b) keeps its text and is
 * retyped as 'string'. Tokens that are not identifiers are left untouched.
 */
function convertIdentifierToValue () {
    if (token.type !== 'identifier') {
        return
    }
    switch (token.value) {
    case 'null':
        token.type = 'null'
        token.value = null
        return
    case 'true':
        token.type = 'boolean'
        token.value = true
        return
    case 'false':
        token.type = 'boolean'
        token.value = false
        return
    case 'Infinity':
        token.type = 'numeric'
        token.value = Infinity
        return
    case 'NaN':
        token.type = 'numeric'
        token.value = NaN
        // Without this return the token would fall through and be retyped
        // as 'string' below.
        return
    }
    // Not a keyword: the identifier text itself is the (string) value.
    token.type = 'string'
}
/**
 * Adds the value described by the current token to the structure being built
 * and selects the next parse state.
 *
 * A '{' or '[' punctuator creates a new object or array; null, boolean,
 * numeric and string tokens contribute their value directly. The first value
 * ever pushed becomes the root; later values are appended to the innermost
 * open array or stored under the pending `key` of the innermost open object.
 * A new container is pushed onto the stack so that following tokens fill it.
 *
 * Object members are written with Object.defineProperty rather than plain
 * assignment, so a key such as `__proto__` becomes an ordinary own property
 * and cannot replace the object's prototype.
 */
function push () {
    let value
    switch (token.type) {
    case 'punctuator':
        switch (token.value) {
        case '{':
            value = {}
            break
        case '[':
            value = []
            break
        }
        break
    case 'null':
    case 'boolean':
    case 'numeric':
    case 'string':
        value = token.value
        break
    // No default: the lexer and parser states never pass any other token type.
    }

    if (root === undefined) {
        root = value
    } else {
        const parent = stack[stack.length - 1]
        if (Array.isArray(parent)) {
            parent.push(value)
        } else {
            Object.defineProperty(parent, key, {
                value,
                writable: true,
                enumerable: true,
                configurable: true,
            })
        }
    }

    if (value !== null && typeof value === 'object') {
        // A container was opened: descend into it.
        stack.push(value)
        if (Array.isArray(value)) {
            parseState = 'beforeArrayValue'
        } else {
            parseState = 'beforePropertyName'
        }
    } else {
        // A primitive was stored: continue after it in the enclosing container,
        // or finish when there is none.
        const current = stack[stack.length - 1]
        if (current == null) {
            parseState = 'end'
        } else if (Array.isArray(current)) {
            parseState = 'afterArrayValue'
        } else {
            parseState = 'afterPropertyValue'
        }
    }
}
/**
 * Closes the innermost open container and selects the parse state that
 * follows it: 'end' when no container remains open, otherwise the "after
 * value" state of the enclosing array or object.
 */
function pop () {
    stack.pop()

    const enclosing = stack[stack.length - 1]
    if (enclosing == null) {
        parseState = 'end'
        return
    }
    parseState = Array.isArray(enclosing) ? 'afterArrayValue' : 'afterPropertyValue'
}
// This code is unreachable.
// function invalidParseState () {
// return new Error(`JSON5: invalid parse state '${parseState}'`)
// }
// This code is unreachable.
// function invalidLexState (state) {
// return new Error(`JSON5: invalid lex state '${state}'`)
// }
/**
 * Builds the error for an unexpected character at the current position.
 *
 * @param {string|undefined} c - The offending character, or undefined when
 *   the input ended unexpectedly.
 * @returns {SyntaxError} Error carrying the current line and column.
 */
function invalidChar (c) {
    const message = c === undefined
        ? `JSON5: invalid end of input at ${line}:${column}`
        : `JSON5: invalid character '${formatChar(c)}' at ${line}:${column}`
    return syntaxError(message)
}
/**
 * Builds the error for input that ends before the value is complete.
 *
 * @returns {SyntaxError} Error carrying the current line and column.
 */
function invalidEOF () {
    const message = `JSON5: invalid end of input at ${line}:${column}`
    return syntaxError(message)
}
// This code is unreachable.
// function invalidToken () {
// if (token.type === 'eof') {
// return syntaxError(`JSON5: invalid end of input at ${line}:${column}`)
// }
// const c = String.fromCodePoint(token.value.codePointAt(0))
// return syntaxError(`JSON5: invalid character '${formatChar(c)}' at ${line}:${column}`)
// }
/**
 * Builds the error for a \uXXXX escape that decodes to a character not
 * allowed in an identifier.
 *
 * The column is moved back by 5 before the error is built; both callers have
 * consumed the 'u' and four hex digits of the escape by then. Note that this
 * modifies the module-level column counter.
 *
 * @returns {SyntaxError} Error carrying the line and the adjusted column.
 */
function invalidIdentifier () {
    column = column - 5
    const message = `JSON5: invalid identifier character at ${line}:${column}`
    return syntaxError(message)
}
/**
 * Logs a warning that a character appearing unescaped in a string is not
 * valid ECMAScript. The character is not rejected.
 *
 * @param {string} c - The character to report.
 */
function separatorChar (c) {
    const shown = formatChar(c)
    console.warn(`JSON5: '${shown}' in strings is not valid ECMAScript; consider escaping`)
}
/**
 * Renders a character for inclusion in an error or warning message.
 *
 * Quotes, the backslash, common control characters and the two Unicode line
 * separators are shown as their escape sequences; any other character below
 * U+0020 is shown as \xHH; everything else is returned unchanged.
 *
 * @param {string} c - The character to format.
 * @returns {string} A printable representation of `c`.
 */
function formatChar (c) {
    const replacements = {
        "'": "\\'",
        '"': '\\"',
        '\\': '\\\\',
        '\b': '\\b',
        '\f': '\\f',
        '\n': '\\n',
        '\r': '\\r',
        '\t': '\\t',
        '\v': '\\v',
        '\0': '\\0',
        '\u2028': '\\u2028',
        '\u2029': '\\u2029',
    }

    const named = replacements[c]
    if (named) {
        return named
    }

    if (!(c < ' ')) {
        return c
    }

    // Remaining control characters: two-digit, zero-padded hex escape.
    const padded = '00' + c.charCodeAt(0).toString(16)
    return '\\x' + padded.slice(-2)
}
/**
 * Creates a SyntaxError annotated with the lexer's current position.
 *
 * @param {string} message - The error message.
 * @returns {SyntaxError} Error with `lineNumber` and `columnNumber` set from
 *   the module-level line and column counters.
 */
function syntaxError (message) {
    return Object.assign(new SyntaxError(message), {
        lineNumber: line,
        columnNumber: column,
    })
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment