Skip to content

Instantly share code, notes, and snippets.

@mikesamuel
Last active February 21, 2019 05:43
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mikesamuel/20710f94a53e440691f04bf79bc3d756 to your computer and use it in GitHub Desktop.
Save mikesamuel/20710f94a53e440691f04bf79bc3d756 to your computer and use it in GitHub Desktop.
A canonicalizing function to make it easy to hash JSON
"use strict";
// Prompted by https://esdiscuss.org/topic/json-canonicalize
// Given a string of JSON produces a string of JSON without unnecessary
// degrees of freedom like whitespace, optional escape sequences, and
// unnecessary variance in number representation.
function hashable(json) {
const strs = [] // Side table to collect string bodies
return reorderProperties(
strs,
json
// Store string content in a side-table.
// This makes the content is still valid JSON but without
// any strings that contain whitespace, commas, colons or brackets.
.replace(/"(?:[^\\"]|\\.)*"/g, (x) => { strs.push(x); return `"@${strs.length-1}"` })
// Eliminate all whitespace.
.replace(/[\x00-\x20]+/g, '')
// Canonicalize all numbers.
.replace(/([+\-]?)(?=[\d.])(\d*)(?:[.](\d*))?(?:[eE]([+\-]?\d+))?(?=[,\]\}]|$)/g, canonNumber))
// Restore the numbers from the side table
.replace(/"@(\d+)"/g, canonStr.bind(null, strs))
}
// RegExp replacer function that canonicalizes a number even if it
// is not precisely representable in JavaScript.
function canonNumber(_, sign, integer, fraction, mantissa) {
if (sign !== '-') { sign = '' }
integer = integer || '0'
fraction = fraction || ''
mantissa = (+mantissa || 0) + integer.length - 1
let digits = (integer + fraction)
// Strip zeroes from front
.replace(/^(?:0(?!$))+/, s => (mantissa -= s.length, '')) || '0'
if (digits === '0') { mantissa = 0 }
// Exponential notation.
if (-26 >= mantissa || mantissa >= 26) {
const afterdot = digits.substring(1).replace(/0+$/, '')
return sign + digits[0] + (afterdot ? `.${afterdot}` : '') + 'e' + mantissa
}
return (mantissa < 0)
// Pad zeroes left
// digits=123456, mantissa=-1 -> 0.123456
? sign + '0.' + '0'.repeat(-mantissa - 1) + digits
: (mantissa + 1 < digits.length)
// Dot in the middle
? sign + digits.substring(0, mantissa + 1) + '.' + digits.substring(mantissa + 1)
// Pad zeroes right
// digits=123456, mantissa=6 -> 1234560
: sign + digits + '0'.repeat(mantissa - digits.length + 1)
}
// Pulls a string out of a side table. "@1" -> strs[1] but with escape sequences normalized.
const canonStr = (strs, s) => JSON.stringify(JSON.parse(strs[s.substring(2, s.length - 1)]))
// Recursively reorder properties on JSON {}-style records
function reorderProperties(strs, s) {
// Pull out tokens and their indices as cues to structure
const tokens = []
const tokenPattern = /[{}\[\],]/g
for (let match; (match = tokenPattern.exec(s));) {
tokens.push(match) // Match has form ['{'] but with index propert.
}
if (!tokens.length) { return s }
const memoTable = []
function canonKey(strIndex) {
if (memoTable[strIndex] === undefined) {
memoTable[strIndex] = JSON.parse(strs[strIndex])
}
return memoTable[strIndex]
}
// Walk the structure and rebuild the output but ordered.
return reorder(0, tokens.length - 1)
function reorder(left, right) {
const { "0": token, index } = tokens[left]
let rangeStart = left, valueStart = null
let depth, cursor
let ranges = []
for (depth = 1, cursor = left + 1; cursor < right; ++cursor) {
switch (tokens[cursor][0]) {
case '{': case '[':
if (depth === 1) { valueStart = cursor }
++depth
break
case ']': case '}': --depth; break
case ',':
if (depth === 1) {
ranges.push([rangeStart, valueStart, cursor])
rangeStart = cursor
valueStart = null
}
}
}
ranges.push([rangeStart, valueStart, right])
ranges = ranges.map(
([start, valueStart, end]) => (valueStart === null
// No nested object or array
? s.substring(tokens[start].index + 1, tokens[end].index)
// Recurse to nested object or array
: s.substring(tokens[start].index + 1, tokens[valueStart].index) + reorder(valueStart, end - 1)))
if (token === '{') {
ranges.sort((a, b) => {
const aKey = canonKey(/^"@(\d+)":/.exec(a)[1])
const bKey = canonKey(/^"@(\d+)":/.exec(b)[1])
return aKey < bKey ? -1 : aKey === bKey ? 0 : 1
})
}
return token + ranges.join(',') + tokens[right][0]
}
}
if (typeof module !== 'undefined') { module.exports = { hashable } }
// Tests for canonNumber
void ([
{ input: ['', '123', '456', '0'], want: '123.456' },
{ input: ['+', '123', '456', '0'], want: '123.456' },
{ input: ['-', '123', '456', '0'], want: '-123.456' },
{ input: ['', '123', '456', '30'], want: '1.23456e32' },
{ input: ['', '123', '456', '-30'], want: '1.23456e-28' },
{ input: ['-', '123', '456', '-30'], want: '-1.23456e-28' },
{ input: ['', '123', '456', '-7'], want: '0.0000123456' },
{ input: ['', '123', '456', '-6'], want: '0.000123456' },
{ input: ['', '123', '456', '-5'], want: '0.00123456' },
{ input: ['', '123', '456', '-4'], want: '0.0123456' },
{ input: ['', '123', '456', '-3'], want: '0.123456' },
{ input: ['', '123', '456', '-2'], want: '1.23456' },
{ input: ['', '123', '456', '-1'], want: '12.3456' },
{ input: ['', '123', '456', '-0'], want: '123.456' },
{ input: ['', '123', '456', '0'], want: '123.456' },
{ input: ['', '123', '456', '1'], want: '1234.56' },
{ input: ['', '123', '456', '2'], want: '12345.6' },
{ input: ['', '123', '456', '3'], want: '123456' },
{ input: ['', '123', '456', '4'], want: '1234560' },
{ input: ['', '123', '456', '5'], want: '12345600' },
{ input: ['', '123', '456', '6'], want: '123456000' },
{ input: ['', '123', '456', '7'], want: '1234560000' },
{ input: ['', '0123', '456', '2'], want: '12345.6' },
{ input: ['', '0', '05', '-2'], want: '0.0005' },
{ input: ['', '0', '05', '2'], want: '5' },
].forEach(({ input, want }, i) => {
console.group(`Test #${i}: ${JSON.stringify(input)}`)
const got = canonNumber(0, ...input)
console.log(got)
if (got !== want) {
throw new Error('expected ' + want)
}
console.groupEnd()
}))
// Tests for hashable
void ([
{ input: String.raw`0`, want: String.raw`0` },
{ input: String.raw`0.0`, want: String.raw`0` },
{ input: String.raw`00`, want: String.raw`0` },
{ input: String.raw`1e100`, want: String.raw`1e100` },
{ input: String.raw`"foo"`, want: String.raw`"foo"` },
{ input: String.raw`{}`, want: String.raw`{}` },
{ input: String.raw`{ }`, want: String.raw`{}` },
{ input: String.raw`{ "c": 1, "a": 2, "b": 3 }`, want: String.raw`{"a":2,"b":3,"c":1}` },
{ input: String.raw`{ "c": 1, "a": { "~": 2, "b": 3 }}`, want: String.raw`{"a":{"b":3,"~":2},"c":1}` },
{ input: String.raw`{ "": [] }`, want: String.raw`{"":[]}` },
{ input: String.raw`[]`, want: String.raw`[]` },
{ input: String.raw`[ ]`, want: String.raw`[]` },
{ input: String.raw`"\u0022"`, want: String.raw`"\""` },
{ input: String.raw`"\\\/"`, want: String.raw`"\\/"` },
{
input: String.raw`{
"foo" : {
"boo": null,
"123.456e1" : [ 123.456e1, "foo\/bar\u0022"],
"bar":"baz"
}
} `,
want: String.raw`{"foo":{"123.456e1":[1234.56,"foo/bar\""],"bar":"baz","boo":null}}`
},
].forEach(({ input, want }, i) => {
console.group(`Test #${i}: ${JSON.stringify(input)}`)
const got = hashable(input)
if (got !== want) {
throw new Error(`got ${got}, expected ${want}`)
}
console.groupEnd()
}))
@cyberphone
Copy link

Hi Mike,
Although the JSON canonicalization concept I'm working on is very different to the one above I have adopted a term I got from you "Hashable" JSON:
https://tools.ietf.org/html/draft-rundgren-json-canonicalization-scheme-05
One reviewer claimed it was a very confused term but I think it is perfect : )
Thanx!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment