Skip to content

Instantly share code, notes, and snippets.

@Dexter1618
Last active August 6, 2021 08:55
Show Gist options
  • Save Dexter1618/cc28b7f2650d1c146362394486dcedde to your computer and use it in GitHub Desktop.
Save Dexter1618/cc28b7f2650d1c146362394486dcedde to your computer and use it in GitHub Desktop.
This Python function reads a large JSON export from MongoDB lazily, as a generator
def read_large_json_file(json_document: str, lines=-1):
    """Lazily yield each JSON document from a large MongoDB JSON export.

    The file is read line by line and only the text of the document
    currently being assembled is kept in memory, so exports that are too
    large for a single ``json.load`` can still be processed. Use it in a
    loop::

        for document in read_large_json_file("[path to json export].json"):
            ...

    Both the array layout (``[{`` ... ``},{`` ... ``}]``) and one object per
    line are accepted, because documents are delimited by tracking brace
    depth rather than by matching specific line contents.

    Args:
        json_document: Path to the exported JSON file (read as UTF-8).
        lines: Maximum number of lines to read from the file, as a safety
            measure for very large files. Any non-positive value (the
            default is ``-1``) means no limit. A document that is still
            incomplete when the limit is reached is not yielded.

    Yields:
        The object returned by ``json.loads`` for each complete top-level
        JSON object, in file order.

    Raises:
        ValueError: If a closing brace appears while no object is open, or
            (as ``json.JSONDecodeError``) if a collected object is not
            valid JSON.
    """
    import json
    import re
    from itertools import islice

    # Matches either a complete string literal or a single brace. String
    # literals are matched first so that braces inside values are skipped;
    # JSON strings cannot contain raw newlines, so a literal never spans lines.
    token = re.compile(r'"(?:\\.|[^"\\])*"|[{}]')

    with open(json_document, "r", encoding="utf-8") as f:
        # islice stops after exactly `lines` lines without reading further.
        source = islice(f, lines) if lines > 0 else f
        pieces = []  # text fragments of the document currently being built
        depth = 0    # number of currently open, unclosed "{"
        for line in source:
            # Offset in this line where the current document's text begins:
            # 0 when continuing from an earlier line, otherwise the position
            # of the "{" that opened it.
            start = 0
            for match in token.finditer(line):
                brace = match.group()
                if brace == "{":
                    if depth == 0:
                        start = match.start()
                    depth += 1
                elif brace == "}":
                    if depth < 1:
                        raise ValueError(
                            "No brackets opened to close now for line = %s"
                            % line.strip()
                        )
                    depth -= 1
                    if depth == 0:
                        # Top-level object closed: parse it and start afresh.
                        pieces.append(line[start:match.end()])
                        yield json.loads("".join(pieces))
                        pieces = []
            if depth > 0:
                # Document continues on the next line; keep the remainder.
                pieces.append(line[start:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment