Last active
December 13, 2022 15:22
-
-
Save fakuivan/c9baf19abfa6196548202d25ced9fb92 to your computer and use it in GitHub Desktop.
Finds valid JSON strings in a file. Useful for reconstructing broken files or for reverse engineering data structures used in protocols or embedded in binaries.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3.8 | |
# "regex" is a third-party module, different from the stdlib re module (https://pypi.org/project/regex/)
from ast import parse | |
from json import dumps | |
from typing import Any, List, NoReturn, TextIO, Tuple | |
import regex | |
import json | |
import sys | |
import argparse | |
# stolen from https://regex101.com/library/tA9pM8 with minor changes
# Recursive pattern for a complete JSON document: a top-level object or array,
# optionally surrounded by whitespace. The (?(DEFINE)...) group only declares
# the named sub-patterns (json, object, pair, array, value, string, number);
# the trailing (?&json) call is what actually performs the match.
#
# The pattern is not compiled in verbose mode, so the newlines used to lay it
# out here are stripped before compiling. Only newlines are stripped: keep the
# pattern lines unindented, otherwise the leading spaces would become literal
# characters of the pattern.
json_regex = regex.compile(r"""
(?(DEFINE)
(?P<json>(?>\s*(?&object)\s*|\s*(?&array)\s*))
(?P<object>(?>\{\s*(?>(?&pair)(?>\s*,\s*(?&pair))*)?\s*\}))
(?P<pair>(?>(?&string)\s*:\s*(?&value)))
(?P<array>(?>\[\s*(?>(?&value)(?>\s*,\s*(?&value))*)?\s*\]))
(?P<value>(?>true|false|null|(?&string)|(?&number)|(?&object)|(?&array)))
(?P<string>(?>"(?>\\(?>["\\\/bfnrt]|u[a-fA-F0-9]{4})|[^"\\\0-\x1F\x7F]+)*"))
(?P<number>(?>-?(?>0|[1-9][0-9]*)(?>\.[0-9]+)?(?>[eE][+-]?[0-9]+)?))
)
(?&json)""".replace("\n", ""))
def main() -> NoReturn:
    """Print the spans of every JSON object/array found in the input file.

    Parses the command line, reads the whole input file as text and collects
    the ``(start, end)`` span of each ``json_regex`` match; the offsets index
    into the text that was read. Unless ``--no-validation`` / ``-V`` is given,
    every matched substring is additionally passed to ``json.loads`` as a
    sanity check on the regex. The spans are written to stdout as a JSON
    array of ``[start, end]`` pairs.

    This function never returns: it exits with status 0 on success, or with
    status 1 (after reporting the offending span on stderr) as soon as a
    matched substring fails validation.
    """
    parser = argparse.ArgumentParser(
        description="Finds valid JSON strings inside a given text")
    parser.add_argument("infile", type=argparse.FileType("r"))
    parser.add_argument("--no-validation", "-V", action="store_true",
                        help="Don't validate JSON after regex pass",
                        dest="no_validation")
    args = parser.parse_args()
    infile: TextIO = args.infile
    no_validation: bool = args.no_validation

    # argparse.FileType hands back an already-open handle; close it as soon
    # as the contents have been read instead of leaking it until exit.
    with infile:
        input_: str = infile.read()

    matches: List[Tuple[int, int]] = [
        match.span() for match in json_regex.finditer(input_)]

    if not no_validation:
        for span in matches:
            try:
                json.loads(input_[span[0]:span[1]])
            except json.JSONDecodeError:
                # The regex and the json module disagree about this
                # substring; report it and abort without printing any spans.
                print(parser.prog + ": "
                      f"substring of span {span} is not a valid JSON object "
                      "(this should never ever happen, please fix the regex or "
                      "contact the dev)",
                      file=sys.stderr)
                sys.exit(1)

    print(json.dumps(matches))
    sys.exit(0)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment