Skip to content

Instantly share code, notes, and snippets.

@fakuivan
Last active December 13, 2022 15:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save fakuivan/c9baf19abfa6196548202d25ced9fb92 to your computer and use it in GitHub Desktop.
Save fakuivan/c9baf19abfa6196548202d25ced9fb92 to your computer and use it in GitHub Desktop.
Finds valid JSON strings in a file. Useful for reconstructing broken files or reverse engineering data structures in protocols or embedded in binaries.
#!/usr/bin/env python3.8
# NOTE: the `regex` import below is the third-party package, which is different from the re module (https://pypi.org/project/regex/)
from ast import parse
from json import dumps
from typing import Any, List, NoReturn, TextIO, Tuple
import regex
import json
import sys
import argparse
# Taken from https://regex101.com/library/tA9pM8 with minor changes.
#
# Recursive grammar for JSON text, compiled with the `regex` module. The
# (?(DEFINE)...) group only declares the named sub-patterns (json, object,
# pair, array, value, string, number); nothing inside it matches on its own.
# The trailing (?&json) call is what actually matches: a top-level object or
# array, including any whitespace directly around it. Bare scalars (strings,
# numbers, true/false/null) are only matched as values nested inside one.
#
# The pattern is written across several lines for readability only: every
# newline is removed from the literal before it is compiled.
json_regex = regex.compile(r"""
(?(DEFINE)
(?P<json>(?>\s*(?&object)\s*|\s*(?&array)\s*))
(?P<object>(?>\{\s*(?>(?&pair)(?>\s*,\s*(?&pair))*)?\s*\}))
(?P<pair>(?>(?&string)\s*:\s*(?&value)))
(?P<array>(?>\[\s*(?>(?&value)(?>\s*,\s*(?&value))*)?\s*\]))
(?P<value>(?>true|false|null|(?&string)|(?&number)|(?&object)|(?&array)))
(?P<string>(?>"(?>\\(?>["\\\/bfnrt]|u[a-fA-F0-9]{4})|[^"\\\0-\x1F\x7F]+)*"))
(?P<number>(?>-?(?>0|[1-9][0-9]*)(?>\.[0-9]+)?(?>[eE][+-]?[0-9]+)?))
)
(?&json)""".replace("\n", ""))
def main() -> NoReturn:
    """Command-line entry point: print the spans of JSON found in a text file.

    Reads the whole file given as the ``infile`` argument, collects the
    ``(start, end)`` span of every ``json_regex`` match and prints the list
    of spans to stdout as JSON.

    Unless ``--no-validation`` / ``-V`` is passed, every matched substring is
    first parsed with ``json.loads``. If one of them fails to parse, an error
    is written to stderr and the process exits with status 1 without
    printing any spans.

    This function never returns normally: it always ends in ``sys.exit``
    (status 0 on success).
    """
    parser = argparse.ArgumentParser(
        description="Finds valid JSON strings inside a given text")
    parser.add_argument("infile", type=argparse.FileType("r"))
    parser.add_argument("--no-validation", "-V", action="store_true",
                        help="Don't validate JSON after regex pass",
                        dest="no_validation")
    args = parser.parse_args()
    infile: TextIO = args.infile
    no_validation: bool = args.no_validation

    # argparse.FileType opens the file but never closes it, so release the
    # handle ourselves as soon as the contents have been read.
    with infile:
        input_: str = infile.read()

    matches: List[Tuple[int, int]] = [
        match.span() for match in json_regex.finditer(input_)]

    if not no_validation:
        # Cross-check the regex against the real JSON parser.
        for span in matches:
            try:
                json.loads(input_[span[0]:span[1]])
            except json.JSONDecodeError:
                print(parser.prog + ": "
                      f"substring of span {span} is not a valid JSON object "
                      "(this should never ever happen, please fix the regex or "
                      "contact the dev)",
                      file=sys.stderr)
                sys.exit(1)

    print(json.dumps(matches))
    sys.exit(0)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment