Last active
August 6, 2021 08:55
-
-
Save Dexter1618/cc28b7f2650d1c146362394486dcedde to your computer and use it in GitHub Desktop.
This Python function reads a large JSON export from MongoDB one document at a time, as a generator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def read_large_json_file(json_document: str, lines: int = -1):
    """Lazily yield each JSON document from a large MongoDB JSON export.

    The file is streamed line by line and never loaded into memory as a
    whole. Every top-level ``{...}`` object found in the file is parsed with
    ``json.loads`` and yielded as soon as its closing brace is seen, so both
    array-style exports (``[{...},{...}]``, pretty-printed or not) and
    one-document-per-line exports are handled. Use it in a loop::

        for document in read_large_json_file("[path to json export].json"):
            ...

    Args:
        json_document: Path to the JSON export file (read as UTF-8).
        lines: Maximum number of lines to read from the file, as a safety
            measure for very large files. Values ``<= 0`` (the default)
            mean "no limit". A document that is still incomplete when the
            limit (or the end of the file) is reached is discarded.

    Yields:
        The parsed documents, in file order.

    Raises:
        ValueError: If a closing ``}`` is found while no document is open,
            or (as ``json.JSONDecodeError``) if a document is not valid JSON.
    """
    import json
    import re

    # Matches either a complete JSON string literal (so that braces inside
    # string values are skipped as a unit) or a single structural brace.
    # JSON strings cannot contain raw newlines, so scanning per line is safe.
    token_re = re.compile(r'"(?:[^"\\]|\\.)*"|[{}]')

    with open(json_document, "r", encoding="utf-8") as f:
        pieces = []  # text fragments of the document currently being read
        depth = 0    # nesting depth of `{`; 0 means "between documents"
        for line_number, line in enumerate(f, start=1):
            if 0 < lines < line_number:
                break
            # Where the current document's text starts on this line: column 0
            # when it continues from a previous line, otherwise the position
            # of its opening brace (set below).
            start = 0
            for match in token_re.finditer(line):
                token = match.group()
                if token == "{":
                    if depth == 0:
                        start = match.start()
                    depth += 1
                elif token == "}":
                    if depth == 0:
                        raise ValueError(
                            "No brackets opened to close now for line = %s"
                            % line.strip()
                        )
                    depth -= 1
                    if depth == 0:
                        # A whole top-level document has been collected.
                        pieces.append(line[start:match.end()])
                        yield json.loads("".join(pieces))
                        pieces = []
                # String-literal tokens carry no structure and are ignored.
            if depth > 0:
                # Keep the rest of the line; text between documents
                # (`[`, `,`, `]`, whitespace) is dropped instead.
                pieces.append(line[start:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment