nolanw/jwcc.py

## jwcc.py
# Lil loading tool for JWCC: JSON With Commas and Comments.
# See docstrings for loads() and jwcc2json() for more info.
# Public domain.
import json
import re

# Each pattern is written once as a str and compiled twice: once for str input and
# once (ASCII-encoded) for bytes/bytearray input, so the two variants cannot drift apart.

# Tokens the scanner in jwcc2json() has to stop at.
_INTERESTING_PATTERN = r'''
    \]           # array end
    | \}         # object end
    | (?<! \\)"  # string start
    | //         # line comment start
    | /\*        # block comment start
'''
_INTERESTING_B = re.compile(_INTERESTING_PATTERN.encode('ascii'), flags=re.VERBOSE)
_INTERESTING_S = re.compile(_INTERESTING_PATTERN, flags=re.VERBOSE)

# Closing quote of a string, searched for starting just after the opening quote.
# A quote is only escaped when an ODD number of backslashes precedes it, so the pattern
# accepts a quote preceded by a complete run of backslash pairs (possibly empty). The
# lookbehind anchors the run at its first backslash. A plain `(?<!\\)"` would wrongly
# skip the closing quote of a string that ends in an escaped backslash, e.g. "a\\".
# Only the match's end() is meaningful to callers; start() may point at the backslash run.
_STRING_END_PATTERN = r'(?<!\\)(?:\\\\)*"'
_STRING_END_B = re.compile(_STRING_END_PATTERN.encode('ascii'))
_STRING_END_S = re.compile(_STRING_END_PATTERN)

# A comma followed only by JSON whitespace up to the end of the searched text.
# Group 1 captures that whitespace so it can be kept when the comma is dropped.
_TRAILING_COMMA_PATTERN = r',([ \t\r\n]*)\Z'
_TRAILING_COMMA_B = re.compile(_TRAILING_COMMA_PATTERN.encode('ascii'))
_TRAILING_COMMA_S = re.compile(_TRAILING_COMMA_PATTERN)


def loads(jwcc_text, *args, **kwargs):
    """Parse JWCC text (a str, bytes, or bytearray instance) into a Python object.

    The input is first reduced to plain JSON by jwcc2json(); that JSON text is then handed to json.loads() together with every extra positional and keyword argument given here.

    >>> loads('[1,2,/*hello*/3,]')
    [1, 2, 3]
    >>> loads(b'[1,2,/*hello*/3,]')
    [1, 2, 3]
    >>> loads(bytearray(b'[1,2,/*hello*/3,]'))
    [1, 2, 3]

    Errors raised by the underlying JSON parser report the same line number as in the JWCC text. (Column numbers may differ.)

    >>> loads('''
    ... // Hi there!
    ... {wrong}
    ... ''')
    Traceback (most recent call last):
        ...
    json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 3 column 2 (char 3)
    """
    return json.loads(jwcc2json(jwcc_text), *args, **kwargs)


def jwcc2json(jwcc_text):
    r"""Strip the JWCC extensions from jwcc_text and return what remains as JSON text.

    JWCC ("JSON With Commas and Comments") is a superset of JSON with two additions:

    • Commas: an optional comma may follow the last element of an array or object.
    • Comments: `/* slash-star block comments */` and `// double-slash line comments` may appear anywhere that JSON allows whitespace.

    This is a scanner, not a full parser, and it assumes the input is valid JWCC. Trailing commas are dropped; a block comment is replaced by the newlines it contains (or by a single space if it contains none); a line comment is removed up to, but not including, its line break. Because line breaks survive, line numbers in errors from a JSON parser run on the result still refer to the right line of the JWCC text.

    jwcc_text must be a str, bytes, or bytearray instance. A str input yields a str; any other input is joined with b''.join().

    Usage examples. Commas:
    >>> jwcc2json('[1,2,3,]')
    '[1,2,3]'
    >>> jwcc2json('[,]')
    '[]'
    >>> jwcc2json('["hu,]h",]')
    '["hu,]h"]'

    Comments:
    >>> jwcc2json('/* ahoyhoy */ "I am a JSON text now"')
    '  "I am a JSON text now"'
    >>> jwcc2json('[1, /* 2, */ 3]')
    '[1,   3]'
    >>> jwcc2json('"hel/*p me*/o"')
    '"hel/*p me*/o"'
    >>> jwcc2json('''
    ... {
    ...   /* Please parse this as an ISO 8601 date. */
    ...   "lunchtime": "2009-10-11T12:13:14Z"
    ... }''')
    '\n{\n   \n  "lunchtime": "2009-10-11T12:13:14Z"\n}'
    >>> jwcc2json('true // dat')
    'true '
    >>> jwcc2json('''
    ... [ 0
    ... // , 1
    ... , 2
    ... ]''')
    '\n[ 0\n\n, 2\n]'
    >>> jwcc2json('''
    ... { "yes": true
    ... , "no": false,
    ... // , "maybe": 2
    ... , "sorta": -2
    ... /* , "coulda": 8
    ... , "shoulda": 9 */
    ... }''')
    '\n{ "yes": true\n, "no": false,\n\n, "sorta": -2\n\n\n}'
    >>> jwcc2json('''
    ... {"": 0,
    ... //
    ... }
    ... ''')
    '\n{"": 0\n\n}\n'
    >>> jwcc2json(r'{"\"": 0 /**/ }')
    '{"\\"": 0   }'

    All together:
    >>> jwcc2json('[1,2,/*hello*/3,]')
    '[1,2, 3]'

    The JWCC specification is at https://nigeltao.github.io/blog/2021/json-with-commas-comments.html and the JSON specification at https://www.rfc-editor.org/rfc/rfc8259.
    """
    # Pick the str or bytes flavour of every pattern and literal used below.
    if isinstance(jwcc_text, str):
        find_token = _INTERESTING_S.search
        find_string_end = _STRING_END_S.search
        find_trailing_comma = _TRAILING_COMMA_S.search
        closers = (']', '}')
        quote, block_open, block_close, line_open = '"', '/*', '*/', '//'
        eol, blank, glue = '\n', ' ', ''
    else:
        find_token = _INTERESTING_B.search
        find_string_end = _STRING_END_B.search
        find_trailing_comma = _TRAILING_COMMA_B.search
        closers = (b']', b'}')
        quote, block_open, block_close, line_open = b'"', b'/*', b'*/', b'//'
        eol, blank, glue = b'\n', b' ', b''

    total = len(jwcc_text)
    # Fragments of output, joined at the very end.
    pieces = []
    pos = 0

    while True:
        found = find_token(jwcc_text, pos)
        if found is None:
            break
        token = found[0]

        # Copy the uninteresting stretch in front of the token unchanged.
        begin = found.start()
        if begin > pos:
            pieces.append(jwcc_text[pos:begin])
            pos = begin

        if token in closers:
            # Array or object end: walk backwards over whitespace-only fragments and drop
            # a trailing comma from the nearest fragment that has one, keeping the
            # whitespace that followed it. Stop at the first fragment with real content.
            for idx in reversed(range(len(pieces))):
                piece = pieces[idx]
                comma = find_trailing_comma(piece)
                if comma:
                    pieces[idx] = piece[:comma.start()] + comma[1]
                    break
                if piece.strip():
                    break
            pieces.append(token)
            pos = found.end()

        elif token == quote:
            # Copy the whole string verbatim so that a quoted `/*` or `//` is never
            # mistaken for a comment. An unterminated string runs to the end of the text.
            closing = find_string_end(jwcc_text, pos + 1)
            stop = closing.end() if closing else total
            pieces.append(jwcc_text[pos:stop])
            pos = stop

        elif token == block_open:
            close_at = jwcc_text.find(block_close, pos + 2)
            if close_at == -1:
                # Unterminated comment: it swallows the rest of the text.
                pos = total
                break
            # Keep only the comment's newlines so line numbers still match; a
            # single-line comment becomes one space, which is valid wherever a
            # comment is.
            line_breaks = jwcc_text.count(eol, pos + 2, close_at)
            pieces.append(eol * line_breaks if line_breaks else blank)
            pos = close_at + 2

        elif token == line_open:
            # Drop everything up to, but not including, the next line break.
            line_end = jwcc_text.find(eol, pos)
            pos = total if line_end == -1 else line_end

    # Copy whatever follows the last token.
    pieces.append(jwcc_text[pos:])
    return glue.join(pieces)


if __name__ == "__main__":
    # Executed as a script: run the doctest examples embedded in this module's docstrings.
    from doctest import testmod

    testmod()
	# Lil loading tool for JWCC: JSON With Commas and Comments.
	# See docstrings for loads() and jwcc2json() for more info.
	# Public domain.
	import json
	import re

	_INTERESTING_B = re.compile(rb'''
	\] # array end
	\| \} # object end
	\| (?<! \\)" # string start
	\| // # line comment start
	\| /\* # block comment start
	''', flags=re.VERBOSE)

	_INTERESTING_S = re.compile(r'''
	\] # array end
	\| \} # object end
	\| (?<! \\)" # string start
	\| // # line comment start
	\| /\* # block comment start
	''', flags=re.VERBOSE)

	_STRING_END_B = re.compile(rb'(?<!\\)"')
	_STRING_END_S = re.compile(r'(?<!\\)"')

	_TRAILING_COMMA_B = re.compile(rb',([ \t\r\n]*)\Z')
	_TRAILING_COMMA_S = re.compile(r',([ \t\r\n]*)\Z')


	def loads(jwcc_text, args, *kwargs):
	"""Deserialize jwcc_text (a str, bytes, or bytearray instance containing a JSON document) to a Python object using json.loads().

	The text is turned into valid JSON using jwcc2json(), then passed to json.loads along with all other parameters.

	>>> loads('[1,2,/hello/3,]')
	[1, 2, 3]
	>>> loads(b'[1,2,/hello/3,]')
	[1, 2, 3]
	>>> loads(bytearray(b'[1,2,/hello/3,]'))
	[1, 2, 3]

	Line numbers should match in any exceptions thrown by the underling JSON parser. (Columns, not so much :)

	>>> loads('''
	... // Hi there!
	... {wrong}
	... ''')
	Traceback (most recent call last):
	...
	json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 3 column 2 (char 3)
	"""
	json_text = jwcc2json(jwcc_text)
	return json.loads(json_text, args, *kwargs)


	def jwcc2json(jwcc_text):
	r"""Return a copy of the JWCC text with the JWCC parts removed.

	JWCC is JSON With Commas and Comments. It is yet another superset-of-JSON. It adds:

	• Commas, allowing an optional comma after the final element of an array or object.
	• Comments, both `/* slash-star block comments */` and `// double-slash line comments`, anywhere that JSON allows whitespace.

	This function does not implement a full parser. It assumes the passed-in JWCC text is valid, removes the Commas and replaces the Comments that are not valid JSON, then returns the result. Care is taken not to change the line count or offset, so parse errors from the eventual JSON parser will point to the correct line in the JWCC text.

	jwcc_text must be a str, bytes, or bytesarray instance.

	Here are some usage examples. Commas:
	>>> jwcc2json('[1,2,3,]')
	'[1,2,3]'
	>>> jwcc2json('[,]')
	'[]'
	>>> jwcc2json('["hu,]h",]')
	'["hu,]h"]'

	Comments:
	>>> jwcc2json('/* ahoyhoy */ "I am a JSON text now"')
	' "I am a JSON text now"'
	>>> jwcc2json('[1, /* 2, */ 3]')
	'[1, 3]'
	>>> jwcc2json('"hel/p me/o"')
	'"hel/p me/o"'
	>>> jwcc2json('''
	... {
	... /* Please parse this as an ISO 8601 date. */
	... "lunchtime": "2009-10-11T12:13:14Z"
	... }''')
	'\n{\n \n "lunchtime": "2009-10-11T12:13:14Z"\n}'
	>>> jwcc2json('true // dat')
	'true '
	>>> jwcc2json('''
	... [ 0
	... // , 1
	... , 2
	... ]''')
	'\n[ 0\n\n, 2\n]'
	>>> jwcc2json('''
	... { "yes": true
	... , "no": false,
	... // , "maybe": 2
	... , "sorta": -2
	... /* , "coulda": 8
	... , "shoulda": 9 */
	... }''')
	'\n{ "yes": true\n, "no": false,\n\n, "sorta": -2\n\n\n}'
	>>> jwcc2json('''
	... {"": 0,
	... //
	... }
	... ''')
	'\n{"": 0\n\n}\n'
	>>> jwcc2json(r'{"\"": 0 /**/ }')
	'{"\\"": 0 }'

	All together:
	>>> jwcc2json('[1,2,/hello/3,]')
	'[1,2, 3]'

	See https://nigeltao.github.io/blog/2021/json-with-commas-comments.html for the JWCC specification, and https://www.rfc-editor.org/rfc/rfc8259 for the JSON specification.
	"""
	# Accumulate bits of the JWCC text to eventually join into JSON text.
	json_parts = []

	# Choose correct regexen
	if isinstance(jwcc_text, str):
	INTERESTING = _INTERESTING_S
	STRING_END = _STRING_END_S
	TRAILING_COMMA = _TRAILING_COMMA_S
	block_comment_end = '*/'
	join = ''.join
	newline = '\n'
	space = ' '
	else:
	INTERESTING = _INTERESTING_B
	STRING_END = _STRING_END_B
	TRAILING_COMMA = _TRAILING_COMMA_B
	block_comment_end = b'*/'
	join = b''.join
	newline = b'\n'
	space = b' '

	# Scan JWCC text, replacing or skipping the non-JSON parts.
	cur = 0
	while interesting := INTERESTING.search(jwcc_text, cur):

	# Whatever we skipped over that wasn't interesting.
	if (start := interesting.start()) > cur:
	json_parts.append(jwcc_text[cur:start])
	cur = start

	# What did we find?
	match interesting[0]:

	# Array end or object end.
	case ']' \| '}' \| b']' \| b'}':

	# Check the latest non-whitespace JSON part for a trailing comma, ignoring whitespace.
	for i in range(len(json_parts) - 1, -1, -1):
	if trail := TRAILING_COMMA.search(json_parts[i]):
	json_parts[i] = json_parts[i][:trail.start()] + trail[1]
	break
	# Not whitespace, no trailing comma
	if json_parts[i].strip():
	break

	json_parts.append(interesting[0])
	cur = interesting.end()

	# String start.
	case '"' \| b'"':

	# Include everything up to the end of the string and skip over it.
	# We must do this before we check for comments, lest we parse a quoted `/*` or `//`.
	if end := STRING_END.search(jwcc_text, cur + 1):
	json_parts.append(jwcc_text[cur:end.end()])
	cur = end.end()
	else:
	json_parts.append(jwcc_text[cur:])
	cur = len(jwcc_text)

	# Block comment.
	case '/' \| b'/':

	# If it's the rest of the text, we're done.
	end = jwcc_text.find(block_comment_end, cur + 2)
	if end == -1:
	cur = len(jwcc_text)
	break

	if (newlines := jwcc_text.count(newline, cur + 2, end)):
	# If the comment spans multiple lines, skip everything but the newlines.
	# Then the line count and line offsets match, and any JSON parsing errors should point to the right line at least.
	json_parts.append(newline * newlines)
	else:
	# If the comment sits on one line, replace it with a space. Comments are allowed wherever JSON whitespace is allowed, so this maintains semantics.
	json_parts.append(space)
	cur = end + 2

	# Line comment.
	case '//' \| b'//':

	# Skip until (but not including) the next line break.
	line_break = jwcc_text.find(newline, cur)
	if line_break == -1:
	cur = len(jwcc_text)
	else:
	cur = line_break

	# Whatever's left that wasn't interesting.
	json_parts.append(jwcc_text[cur:])

	return join(json_parts)


	if __name__ == "__main__":
	import doctest
	doctest.testmod()