arcanosam/re_json_file.py

## re_json_file.py
"""Read a JSON file in chunks and append each JSON structure found to a list.
"""
import re
import json

# Path of the JSON file to read; replace the placeholder before running.
json_file = '<<fill with a complete path to a json file>>'

# Buffer of text read from the file that has not yet been matched as a
# JSON structure.
text_read = ''

# Every JSON structure found is appended to this list (as parsed by
# json.loads).
lst_dct_jsons = []

# Intended to hold the JSON structure found.
# NOTE(review): the reading loop in this file assigns `json_found`, not this
# name -- presumably a leftover; confirm before removing.
json_founded = ''


def json_from_text(txt):
    """Return the first complete JSON object contained in ``txt``.

    The search starts at the first ``{`` and lets the standard JSON decoder
    locate the matching end of the object. Nested objects and braces inside
    string values are therefore handled correctly; a non-greedy ``{.*?}``
    pattern would stop at the first ``}`` and return a broken fragment.

    Originally based on https://stackoverflow.com/a/34960703

    :param txt: text that may contain one or more JSON objects, possibly
        ending with a truncated one
    :return: the substring holding the first JSON object, or ``None`` when
        ``txt`` contains no ``{`` or the text starting at the first ``{`` is
        not (yet) a complete, valid JSON object
    """
    start = txt.find('{')
    if start == -1:
        return None

    try:
        # raw_decode parses a single JSON value beginning at ``start`` and
        # returns the index just past it; trailing text is ignored.
        _, end = json.JSONDecoder().raw_decode(txt, start)
    except ValueError:
        # json.JSONDecodeError is a ValueError: the object is truncated or
        # malformed, so there is nothing complete to return yet.
        return None

    return txt[start:end]


def read_in_chunks(file_object, chunk_size=100):
    """Yield successive pieces of ``file_object`` until it is exhausted.

    Each piece is whatever ``file_object.read(chunk_size)`` returns; the
    generator stops at the first empty result. ``chunk_size`` defaults
    to 100.
    source:https://stackoverflow.com/a/519653"""
    piece = file_object.read(chunk_size)

    while piece:
        yield piece
        # Fetch the next piece only once the consumer asks for it.
        piece = file_object.read(chunk_size)

# Read the file chunk by chunk, moving every JSON structure found in the
# buffered text into ``lst_dct_jsons``.
with open(json_file, 'r', encoding='utf-8') as file_h:

    # A single generator for the whole file; an empty file simply yields
    # nothing instead of raising StopIteration on the first read.
    chunks = read_in_chunks(file_h)

    while True:

        # looking for a JSON structure in the text buffered so far
        json_found = json_from_text(text_read)

        if json_found is not None:
            # json.loads() no longer accepts ``encoding`` (TypeError on
            # Python 3.9+); the text was already decoded by open().
            lst_dct_jsons.append(json.loads(json_found))

            # Remove only the occurrence just consumed (an identical record
            # later in the buffer must survive), then the separating comma
            # and surrounding spaces.
            text_read = (
                text_read.replace(json_found, '', 1).replace(',', '', 1).strip()
            )

        chunk = next(chunks, None)

        if chunk is not None:
            # The chunk is appended without line breaks, double spaces and
            # the enclosing [ ].
            # NOTE(review): these replacements apply to the whole chunk, so
            # the same characters inside string values or nested arrays are
            # removed as well.
            text_read += (
                chunk.replace("\n", '').replace('  ', '')
                .replace('[', '').replace(']', '')
            )
        elif json_found is None:
            # File exhausted and nothing more can be extracted from the
            # buffer: stop instead of looping forever on the leftover text.
            break

print('Founded json records/structures: {0}.'.format(len(lst_dct_jsons)))
print('Size of remain text from file(if exists):{0}'.format(len(text_read)))
	"""Read a JSON file in chunks and append each JSON structure found to a list.
	"""
	import re
	import json

	# Path of the JSON file to read; replace the placeholder before running.
	json_file = '<<fill with a complete path to a json file>>'

	# Buffer of text read from the file that has not yet been matched as a
	# JSON structure.
	text_read = ''

	# Every JSON structure found is appended to this list (as parsed by
	# json.loads).
	lst_dct_jsons = []

	# Intended to hold the JSON structure found.
	# NOTE(review): the reading loop below assigns `json_found`, not this
	# name -- presumably a leftover; confirm before removing.
	json_founded = ''


	def json_from_text(txt):
	    """Return the first complete JSON object contained in ``txt``.

	    The search starts at the first ``{`` and lets the standard JSON
	    decoder locate the matching end of the object. Nested objects and
	    braces inside string values are therefore handled correctly; a
	    non-greedy ``{.*?}`` pattern would stop at the first ``}``.

	    Originally based on https://stackoverflow.com/a/34960703

	    :param txt: text that may contain one or more JSON objects, possibly
	        ending with a truncated one
	    :return: the substring holding the first JSON object, or ``None``
	        when ``txt`` contains no ``{`` or the text starting at the first
	        ``{`` is not (yet) a complete, valid JSON object
	    """
	    start = txt.find('{')
	    if start == -1:
	        return None

	    try:
	        # raw_decode parses a single JSON value beginning at ``start``
	        # and returns the index just past it.
	        _, end = json.JSONDecoder().raw_decode(txt, start)
	    except ValueError:
	        # json.JSONDecodeError is a ValueError: truncated or malformed.
	        return None

	    return txt[start:end]


	def read_in_chunks(file_object, chunk_size=100):
	    """Lazy function (generator) to read a file piece by piece.

	    Yields whatever ``file_object.read(chunk_size)`` returns until the
	    first empty result. Default chunk size: 100.
	    source:https://stackoverflow.com/a/519653"""

	    while True:
	        data = file_object.read(chunk_size)

	        # An empty read means the file is exhausted.
	        if not data:
	            break

	        yield data

	# Read the file chunk by chunk, moving every JSON structure found in the
	# buffered text into ``lst_dct_jsons``.
	with open(json_file, 'r', encoding='utf-8') as file_h:

	    # A single generator for the whole file; an empty file simply yields
	    # nothing instead of raising StopIteration on the first read.
	    chunks = read_in_chunks(file_h)

	    while True:

	        # looking for a JSON structure in the text buffered so far
	        json_found = json_from_text(text_read)

	        if json_found is not None:
	            # json.loads() no longer accepts ``encoding`` (TypeError on
	            # Python 3.9+); the text was already decoded by open().
	            lst_dct_jsons.append(json.loads(json_found))

	            # Remove only the occurrence just consumed, then the
	            # separating comma and surrounding spaces.
	            text_read = (
	                text_read.replace(json_found, '', 1).replace(',', '', 1).strip()
	            )

	        chunk = next(chunks, None)

	        if chunk is not None:
	            # The chunk is appended without line breaks, double spaces
	            # and the enclosing [ ].
	            # NOTE(review): these replacements apply to the whole chunk,
	            # so the same characters inside string values or nested
	            # arrays are removed as well.
	            text_read += (
	                chunk.replace("\n", '').replace('  ', '')
	                .replace('[', '').replace(']', '')
	            )
	        elif json_found is None:
	            # File exhausted and nothing more can be extracted from the
	            # buffer: stop instead of looping forever on the leftover.
	            break

	print('Founded json records/structures: {0}.'.format(len(lst_dct_jsons)))
	print('Size of remain text from file(if exists):{0}'.format(len(text_read)))