Skip to content

Instantly share code, notes, and snippets.

@jpmckinney
Last active May 10, 2021 18:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jpmckinney/d3750bb2062504102a7a3d211dfd1428 to your computer and use it in GitHub Desktop.
Save jpmckinney/d3750bb2062504102a7a3d211dfd1428 to your computer and use it in GitHub Desktop.
Fixes errors in invalid JSON. Running time is (length of input) x (number of errors).
#!/usr/bin/env python
import json
import sys
# Decoder messages that indicate an unescaped double quote inside a string.
_UNESCAPED_QUOTE_MESSAGES = (
    # Occurs after an unescaped quote character followed by a non-comma.
    "Expecting ',' delimiter",
    # Occurs after an unescaped quote character followed by a comma.
    "Expecting property name enclosed in double quotes",
)
# Occurs for unescaped characters like tab, etc.
_CONTROL_CHARACTER_MESSAGE = "Invalid control character at"


def _error_context(s, pos):
    """Return a snippet of *s* around *pos* for an error report.

    The snippet runs from the last double quote before ``pos - 1`` to just
    past the first double quote at or after ``pos``. If either quote is
    missing, a fixed-size window around *pos* is returned instead.
    """
    try:
        start = s.rindex('"', 0, max(pos - 1, 0))
        end = s.index('"', pos) + 2
    except ValueError:
        # No surrounding quotes to anchor on: fall back to a plain window.
        start, end = max(pos - 40, 0), pos + 40
    return s[start:end]


def run(filename):
    """Repair known errors in the JSON file *filename* and print the result.

    The file is parsed repeatedly. After each decode error that this code
    knows how to fix (an unescaped double quote or an unescaped control
    character inside a string), the offending character is escaped and
    parsing restarts. A "." is written to stderr for every repair.

    On success, the parsed value is printed to stdout with ``print()`` (that
    is, as a Python ``repr``, not re-serialized JSON). If an error cannot be
    repaired, the error and the surrounding text are written to stderr and
    nothing is printed to stdout.

    :param filename: path of the file to read
    :returns: None
    """
    # JSON text exchanged between systems is UTF-8, so don't depend on the
    # platform's default encoding.
    with open(filename, encoding="utf-8") as f:
        s = f.read()

    # This code will re-read the data from the beginning after fixing each error, which is slow. A better solution
    # would be to write/extend a JSON parser that can fix errors as it reads the data in a single pass.
    #
    # If the errors occur in predictable locations (e.g. control characters only occur inside strings and can therefore
    # all be escaped), then string substitution and regular expressions can be used.
    #
    # Inspired by https://stackoverflow.com/questions/18514910/how-do-i-automatically-fix-an-invalid-json-string
    while True:
        try:
            result = json.loads(s)
            break
        # Only JSONDecodeError carries the `msg` and `pos` attributes used below.
        except json.JSONDecodeError as e:
            if e.msg in _UNESCAPED_QUOTE_MESSAGES:
                # If the quote character is followed by whitespace, the error position will not be immediately after
                # the quote character, so we search backwards for it. rfind() returns -1 if there is no quote.
                unescaped = s.rfind('"', 0, e.pos)
                replacement = r'\"'
            elif e.msg == _CONTROL_CHARACTER_MESSAGE:
                unescaped = e.pos
                # json.dumps() yields the quoted, escaped form; strip the surrounding quotes.
                replacement = json.dumps(s[e.pos])[1:-1]
            else:
                unescaped = -1

            if unescaped < 0:
                # Print any errors we can't handle, to be added to this code, and stop without a result.
                print(repr([str(e), _error_context(s, e.pos)]), file=sys.stderr)
                return

            s = s[:unescaped] + replacement + s[unescaped + 1:]
            print('.', end='', file=sys.stderr, flush=True)

    print(result)
if __name__ == '__main__':
    # Fail with a usage message rather than an IndexError traceback when the
    # file argument is missing. sys.exit() with a string writes it to stderr
    # and exits with status 1.
    if len(sys.argv) < 2:
        sys.exit('usage: {} FILE'.format(sys.argv[0]))
    run(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment