Skip to content

Instantly share code, notes, and snippets.

@nolanw
Last active July 2, 2024 22:22
Show Gist options
  • Save nolanw/603eb96656570d4ff47635b77279eece to your computer and use it in GitHub Desktop.
Save nolanw/603eb96656570d4ff47635b77279eece to your computer and use it in GitHub Desktop.
Load JWCC: JSON With Commas and Comments
# Lil loading tool for JWCC: JSON With Commas and Comments.
# See docstrings for loads() and jwcc2json() for more info.
# Public domain.
import json
import re
# Pattern sources are written once as str and encoded for the bytes variants,
# so the two flavours of every regex can never drift apart.

# The tokens the scanner in jwcc2json() has to stop at; everything else is
# copied through untouched.
_INTERESTING_SRC = r'''
      \]            # array end
    | \}            # object end
    | (?<! \\)"     # string start
    | //            # line comment start
    | /\*           # block comment start
'''
_INTERESTING_B = re.compile(_INTERESTING_SRC.encode('ascii'), flags=re.VERBOSE)
_INTERESTING_S = re.compile(_INTERESTING_SRC, flags=re.VERBOSE)

# The quote that closes a string. It is searched for starting just after the
# opening quote, and only the end of the match is meaningful to callers.
# A quote closes the string only when it is preceded by an EVEN number of
# backslashes (including zero): `\"` is an escaped quote, but in `\\"` the
# backslashes escape each other and the quote is real. The previous pattern,
# `(?<!\\)"`, rejected every quote that followed a backslash, so a string
# such as "\\" was never closed and the scan swallowed whatever followed it.
_STRING_END_SRC = r'(?<!\\)(?:\\\\)*"'
_STRING_END_B = re.compile(_STRING_END_SRC.encode('ascii'))
_STRING_END_S = re.compile(_STRING_END_SRC)

# A comma followed only by JSON whitespace up to the end of the searched text.
# Group 1 captures that whitespace so it can be kept when the comma is dropped.
_TRAILING_COMMA_SRC = r',([ \t\r\n]*)\Z'
_TRAILING_COMMA_B = re.compile(_TRAILING_COMMA_SRC.encode('ascii'))
_TRAILING_COMMA_S = re.compile(_TRAILING_COMMA_SRC)
def loads(jwcc_text, *args, **kwargs):
    """Parse JWCC text (JSON With Commas and Comments) into a Python object.

    jwcc_text is a str, bytes, or bytearray instance. It is first rewritten
    into plain JSON by jwcc2json(); that JSON is then given to json.loads()
    together with every other positional and keyword argument, and whatever
    json.loads() returns (or raises) is passed straight back to the caller.

    >>> loads('[1,2,/*hello*/3,]')
    [1, 2, 3]
    >>> loads(b'[1,2,/*hello*/3,]')
    [1, 2, 3]
    >>> loads(bytearray(b'[1,2,/*hello*/3,]'))
    [1, 2, 3]

    Line numbers should match in any exceptions thrown by the underlying
    JSON parser. (Columns, not so much :)

    >>> loads('''
    ... // Hi there!
    ... {wrong}
    ... ''')
    Traceback (most recent call last):
        ...
    json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 3 column 2 (char 3)
    """
    return json.loads(jwcc2json(jwcc_text), *args, **kwargs)
def _remove_dangling_comma(chunks, find_trailing_comma):
    """Drop the comma, if any, that ends the text emitted so far.

    chunks is the list of already-emitted pieces and is modified in place.
    It is walked from the newest piece to the oldest: the first piece that
    ends in a comma (optionally followed by whitespace) loses that comma but
    keeps the whitespace; the walk stops without changes at the first piece
    that holds anything other than whitespace.
    """
    for idx in reversed(range(len(chunks))):
        chunk = chunks[idx]
        hit = find_trailing_comma(chunk)
        if hit is not None:
            chunks[idx] = chunk[:hit.start()] + hit[1]
            return
        if chunk.strip():
            # Real content with no comma after it: nothing to remove.
            return


def jwcc2json(jwcc_text):
    r"""Return a copy of the JWCC text with the JWCC-only parts taken out.

    JWCC is JSON With Commas and Comments, yet another superset of JSON.
    On top of JSON it allows:

    - Commas: an optional comma after the final element of an array or object.
    - Comments: both `/* slash-star block comments */` and
      `// double-slash line comments`, anywhere that JSON allows whitespace.

    This is not a full parser. The input is assumed to be valid JWCC; the
    extra Commas are removed and the Comments are replaced by whitespace or
    dropped, and the result is returned. Line breaks are preserved, so parse
    errors from the eventual JSON parser point at the right line of the JWCC
    text.

    jwcc_text must be a str, bytes, or bytearray instance. A str input gives
    a str result; any other input is assembled with b''.join().

    Here are some usage examples. Commas:

    >>> jwcc2json('[1,2,3,]')
    '[1,2,3]'
    >>> jwcc2json('[,]')
    '[]'
    >>> jwcc2json('["hu,]h",]')
    '["hu,]h"]'

    Comments:

    >>> jwcc2json('/* ahoyhoy */ "I am a JSON text now"')
    '  "I am a JSON text now"'
    >>> jwcc2json('[1, /* 2, */ 3]')
    '[1,   3]'
    >>> jwcc2json('"hel/*p me*/o"')
    '"hel/*p me*/o"'
    >>> jwcc2json('''
    ... {
    ...   /* Please parse this as an ISO 8601 date. */
    ...   "lunchtime": "2009-10-11T12:13:14Z"
    ... }''')
    '\n{\n   \n  "lunchtime": "2009-10-11T12:13:14Z"\n}'
    >>> jwcc2json('true // dat')
    'true '
    >>> jwcc2json('''
    ... [ 0
    ... // , 1
    ... , 2
    ... ]''')
    '\n[ 0\n\n, 2\n]'
    >>> jwcc2json('''
    ... { "yes": true
    ... , "no": false,
    ... // , "maybe": 2
    ... , "sorta": -2
    ... /* , "coulda": 8
    ... , "shoulda": 9 */
    ... }''')
    '\n{ "yes": true\n, "no": false,\n\n, "sorta": -2\n\n\n}'
    >>> jwcc2json('''
    ... {"": 0,
    ... //
    ... }
    ... ''')
    '\n{"": 0\n\n}\n'
    >>> jwcc2json(r'{"\"": 0 /**/ }')
    '{"\\"": 0   }'

    All together:

    >>> jwcc2json('[1,2,/*hello*/3,]')
    '[1,2, 3]'

    See https://nigeltao.github.io/blog/2021/json-with-commas-comments.html
    for the JWCC specification, and https://www.rfc-editor.org/rfc/rfc8259
    for the JSON specification.
    """
    # Select the regexes and literals that match the input's type, so every
    # search, comparison, and join below stays within one type family.
    if isinstance(jwcc_text, str):
        find_token = _INTERESTING_S.search
        find_string_end = _STRING_END_S.search
        find_trailing_comma = _TRAILING_COMMA_S.search
        closers, quote = (']', '}'), '"'
        block_open, block_close, line_open = '/*', '*/', '//'
        eol, blank, glue = '\n', ' ', ''.join
    else:
        find_token = _INTERESTING_B.search
        find_string_end = _STRING_END_B.search
        find_trailing_comma = _TRAILING_COMMA_B.search
        closers, quote = (b']', b'}'), b'"'
        block_open, block_close, line_open = b'/*', b'*/', b'//'
        eol, blank, glue = b'\n', b' ', b''.join

    chunks = []  # pieces of output, joined once at the very end
    total = len(jwcc_text)
    pos = 0  # everything before this offset has been handled

    while (found := find_token(jwcc_text, pos)) is not None:
        token = found[0]

        # Plain JSON between the previous stop and this token is copied as is.
        token_at = found.start()
        if token_at > pos:
            chunks.append(jwcc_text[pos:token_at])
        pos = token_at

        if token in closers:
            # End of an array or object: a comma right before it (ignoring
            # whitespace and removed comments) is a JWCC trailing comma.
            _remove_dangling_comma(chunks, find_trailing_comma)
            chunks.append(token)
            pos = found.end()

        elif token == quote:
            # Copy the whole string literal in one go. Doing this before any
            # comment handling keeps a quoted `/*` or `//` from being treated
            # as a comment. An unterminated string runs to the end of input.
            closing = find_string_end(jwcc_text, pos + 1)
            stop = total if closing is None else closing.end()
            chunks.append(jwcc_text[pos:stop])
            pos = stop

        elif token == block_open:
            close_at = jwcc_text.find(block_close, pos + 2)
            if close_at == -1:
                # The comment is never closed: it is the rest of the text.
                pos = total
                break
            # A multi-line comment is reduced to just its line breaks so that
            # line numbers still agree with the JWCC text; a one-line comment
            # becomes a single space, which is valid wherever a comment is.
            line_breaks = jwcc_text.count(eol, pos + 2, close_at)
            chunks.append(eol * line_breaks if line_breaks else blank)
            pos = close_at + 2

        elif token == line_open:
            # Skip up to, but not including, the next line break.
            eol_at = jwcc_text.find(eol, pos)
            pos = total if eol_at == -1 else eol_at

    # Whatever follows the last token is plain JSON.
    chunks.append(jwcc_text[pos:])
    return glue(chunks)
# Executing this file as a script runs the doctests embedded in the docstrings.
if __name__ == "__main__":
    from doctest import testmod

    testmod()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment