Reading a JSON file in chunks and identifying each JSON structure as the chunks are read
"""Read a JSON file in chunks and append each JSON structure found to a list.
"""
import json
import re
# Path to the JSON file to process.
json_file = '<<fill with a complete path to a json file>>'
# Buffer holding the text read from the file until a JSON structure is found.
text_read = ''
# Every JSON structure found is appended to this list as a dict.
lst_dct_jsons = []
# Holds a JSON structure that was found (starts out empty).
json_founded = ''
def json_from_text(txt):
    """Return the text of the first complete JSON object found in ``txt``.

    The search starts at the first ``{`` and the object beginning there is
    parsed with the standard JSON decoder, so nested objects and braces that
    appear inside string values are handled correctly (a ``{.*?}`` regex
    stops at the first ``}`` and returns a truncated fragment for those).

    :param txt: JSON text read from a JSON file (may be a partial chunk)
    :return: the exact text of the first JSON object, or ``None`` when
        ``txt`` has no ``{`` or the object starting there is incomplete
        or malformed
    Originally based on: https://stackoverflow.com/a/34960703
    """
    start = txt.find('{')
    if start == -1:
        return None
    try:
        # raw_decode accepts trailing data and reports the index at which
        # the decoded value ends.
        _, end = json.JSONDecoder().raw_decode(txt, start)
    except ValueError:
        # json.JSONDecodeError is a ValueError: truncated or invalid text.
        return None
    return txt[start:end]
def read_in_chunks(file_object, chunk_size=100):
    """Lazy function (generator) to read a file piece by piece.

    :param file_object: open file-like object exposing ``read(size)``
    :param chunk_size: size passed to every ``read`` call (default 100);
        this counts characters for a text-mode file and bytes for a
        binary-mode file
    :return: yields each non-empty piece in order and stops at the first
        empty read
    source:https://stackoverflow.com/a/519653
    """
    while True:
        data = file_object.read(chunk_size)
        if not data:
            # An empty read means the end of the file was reached.
            break
        yield data
# One decoder is reused for every record. raw_decode parses a single value
# and reports the index where it ends, so data following it is not an error.
decoder = json.JSONDecoder()
# Characters that only separate records at the top level of the file:
# whitespace, commas and the brackets of an enclosing array. They are
# stripped from the front of the buffer only, never from inside a record,
# so string values and nested arrays stay intact.
separators = ' \t\r\n,[]'

with open(json_file, 'r', encoding='utf-8') as file_h:
    # A single generator drives the whole read; an empty file simply yields
    # nothing, and the loop ends at end of file even if unparsed text remains.
    for chunk in read_in_chunks(file_h):
        text_read = ''.join([text_read, chunk])
        # Extract every complete JSON structure currently in the buffer.
        while True:
            text_read = text_read.lstrip(separators)
            start = text_read.find('{')
            if start == -1:
                # No structure starts in the buffered text yet.
                break
            try:
                record, end = decoder.raw_decode(text_read, start)
            except ValueError:
                # The structure is still incomplete (or malformed): keep the
                # text buffered and wait for the next chunk.
                break
            lst_dct_jsons.append(record)
            # Remove exactly the text that was consumed, so an identical
            # record later in the file is still counted.
            text_read = text_read[:start] + text_read[end:]

print('Founded json records/structures: {0}.'.format(len(lst_dct_jsons)))
print('Size of remain text from file(if exists):{0}'.format(len(text_read)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment