Created
December 11, 2023 19:56
-
-
Save webstrand/885a3c5fcb608fa4209d230da54a55c9 to your computer and use it in GitHub Desktop.
JSON string parser that generates minimal garbage by avoiding regex match objects
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Decodes the body of a JSON string literal embedded in `json`.
 *
 * Scanning starts at `startIndex` and stops at the first unescaped `"`;
 * anything after that closing quote is ignored. Sticky regexes are driven
 * through `test()` and `lastIndex`, so no regex match objects are allocated
 * while scanning.
 *
 * @param json - Text that contains the string literal.
 * @param startIndex - Index of the first character after the opening `"`.
 * @returns The decoded contents of the literal, without the surrounding quotes.
 * @throws Error if the literal is not terminated before the end of `json`,
 *   or if it contains an unknown escape or a malformed `\uXXXX` escape.
 */
function parseString(json: string, startIndex: number): string {
    // `Text` uses `*` rather than `+`: an empty run of plain text (an empty
    // literal, or an escape directly followed by another escape or by the
    // closing quote) must still count as a successful sticky match.
    const Text = /[^\\"]*/y;
    const Escapes = /\\+/y;
    const Unicode = /[\dA-Fa-f]{4}/y;
    let result = "";
    for(let textStart = Text.lastIndex = startIndex;;) {
        // Scan forward from lastIndex until we encounter:
        //   1. closing quote
        //   2. end-of-string
        //   3. escape
        // The match itself can only fail when lastIndex lies beyond the input.
        if(!Text.test(json)) throw new Error("unterminated string literal");
        const opStart = Text.lastIndex;
        const op = json.charCodeAt(opStart);
        if(op === 34 /* `"` */) {
            // 1. We encountered a closing quote
            return result + json.slice(textStart, opStart);
        }
        if(opStart === json.length) {
            // 2. We encountered the end-of-string
            throw new Error("Unterminated string");
        }
        // 3. We encountered an escape
        if(op !== 92 /* `\\` */) throw new Error("LOGIC: We did not encounter an escape");

        // Scan forward consuming the whole run of backslashes. The failure
        // message is only built on the (unexpected) failure path.
        Escapes.lastIndex = opStart;
        if(!Escapes.test(json)) {
            throw new Error(`LOGIC: Escapes failed to match after Op: ${json.slice(opStart)}`);
        }
        const seqEnd = Escapes.lastIndex;
        const seqLen = seqEnd - opStart;

        // Each pair of backslashes encodes one literal backslash, so append the
        // pending text plus floor(seqLen / 2) backslashes taken from the run.
        result += json.slice(textStart, opStart + (seqLen >> 1));

        if(seqLen % 2 === 0) {
            // Only escaped backslashes: whatever follows is ordinary input and
            // is classified by the next iteration (including a closing quote).
            textStart = Text.lastIndex = seqEnd;
            continue;
        }

        // Odd run: the last backslash escapes the character at seqEnd.
        if(seqEnd === json.length) throw new Error("Unterminated string");
        switch(json.charCodeAt(seqEnd)) {
            case 34: // `"`
            case 47: // `/`
                // The character stands for itself: begin the next text run on
                // it, but resume scanning after it so that an escaped quote is
                // not mistaken for the closing quote.
                textStart = seqEnd;
                Text.lastIndex = seqEnd + 1;
                break;
            case 98: // `b`
                result += "\b";
                textStart = Text.lastIndex = seqEnd + 1;
                break;
            case 102: // `f`
                result += "\f";
                textStart = Text.lastIndex = seqEnd + 1;
                break;
            case 110: // `n`
                result += "\n";
                textStart = Text.lastIndex = seqEnd + 1;
                break;
            case 114: // `r`
                result += "\r";
                textStart = Text.lastIndex = seqEnd + 1;
                break;
            case 116: // `t`
                result += "\t";
                textStart = Text.lastIndex = seqEnd + 1;
                break;
            case 117: { // `u`
                // Validate the four hex digits with test() instead of exec()
                // so that no match array is allocated.
                Unicode.lastIndex = seqEnd + 1;
                if(!Unicode.test(json)) {
                    throw new Error(`Invalid unicode escape sequence \\${json.slice(seqEnd, seqEnd + 5)}`);
                }
                result += String.fromCharCode(parseInt(json.slice(seqEnd + 1, seqEnd + 5), 16));
                textStart = Text.lastIndex = seqEnd + 5;
                break;
            }
            default:
                throw new Error(`Unrecognizable escape sequence \\${json[seqEnd]}`);
        }
    }
}
// Demo: decode the string literal whose body starts at index 12 (just past the
// opening quote) and print the result.
const demoSource = String.raw`foobarbaz: "hello \"world\" \this is (\u0072) a \\q test\\", contentThatComeAfter`;
const demoBodyStart = 12;
console.log(parseString(demoSource, demoBodyStart));
/**
 * Throws when `condition` is falsy; otherwise narrows `condition` to truthy.
 *
 * @param condition - Value that is expected to be truthy.
 * @param message - Error message, or a function that produces it. The function
 *   is only called when the assertion fails.
 * @param cons - Error constructor used for the thrown error; defaults to `Error`.
 * @throws An instance of `cons` carrying the message when `condition` is falsy.
 */
function assert(
    condition: unknown,
    message: string | (() => string),
    cons: new (message: string) => Error = Error,
): asserts condition {
    if(condition) return;
    let text: string;
    if(typeof message === "function") text = message();
    else text = message;
    throw new cons(text);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment