Skip to content

Instantly share code, notes, and snippets.

@MHM5000
Forked from iyvinjose/data_loading_utils.py
Created October 8, 2019 05:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MHM5000/4d1db2f28b2765149abf95a8331d9df4 to your computer and use it in GitHub Desktop.
Save MHM5000/4d1db2f28b2765149abf95a8331d9df4 to your computer and use it in GitHub Desktop.
Read large files line by line without loading entire file to memory. Supports files of GB size
def read_lines_from_file_as_data_chunks(file_name, chunk_size, callback, return_whole_chunk=False):
    """
    Read a file line by line regardless of its size, without loading the
    whole file into memory (supports GB-sized files).

    :param file_name: absolute path of file to read
    :param chunk_size: number of characters to read at a time
    :param callback: callback method, prototype ----> def callback(data, eof, file_name)
    :param return_whole_chunk: if True, the callback receives the list of
        complete lines found in each chunk instead of one line per call
    :return: None
    """

    def read_in_chunks(file_obj, size):
        """
        Lazily yield successive reads of `size` characters from file_obj.
        https://stackoverflow.com/a/519653/5130720
        """
        while True:
            data = file_obj.read(size)
            if not data:
                break
            yield data

    data_left_over = None

    # `with` guarantees the file is closed even if the callback raises
    # (the original leaked the handle: open() without close()).
    with open(file_name) as fp:
        # honor the caller's chunk_size — the original always used the
        # inner default of 5000 and ignored the parameter entirely
        for chunk in read_in_chunks(fp, chunk_size):
            # prepend the partial line carried over from the previous chunk
            if data_left_over:
                current_chunk = data_left_over + chunk
            else:
                current_chunk = chunk

            lines = current_chunk.splitlines()

            # if the chunk does not end at a line boundary, its last entry
            # is an incomplete line — hold it back for the next iteration
            if current_chunk.endswith('\n'):
                data_left_over = None
            else:
                data_left_over = lines.pop()

            if return_whole_chunk:
                callback(data=lines, eof=False, file_name=file_name)
            else:
                for line in lines:
                    callback(data=line, eof=False, file_name=file_name)

    # flush the trailing partial line (file did not end with a newline);
    # guarding on data_left_over also fixes the original NameError on an
    # empty file, where current_chunk was read before ever being assigned
    if data_left_over:
        tail_lines = data_left_over.splitlines()
        if return_whole_chunk:
            callback(data=tail_lines, eof=False, file_name=file_name)
        else:
            for line in tail_lines:
                callback(data=line, eof=False, file_name=file_name)

    callback(data=None, eof=True, file_name=file_name)
# Example usage. The original gist shipped the reader in a separate module
# and wrote `import data_loading_utils.py.py`, which is not a valid import
# (it would be `import data_loading_utils`). Here the function is defined
# above in this same file, so we call it directly instead.

file_name = 'file_name.ext'
CHUNK_SIZE = 1000000  # configure this variable depending on your machine's hardware configuration


def process_lines(data, eof, file_name):
    """
    Callback invoked by read_lines_from_file_as_data_chunks.

    :param data: one single line of the file (or a list of lines when
        return_whole_chunk=True); None on the final eof call
    :param eof: True exactly once, after the whole file has been read
    :param file_name: path of the file being read
    """
    if not eof:
        # process data, data is one single line of the file
        pass
    else:
        # end of file reached
        pass


if __name__ == "__main__":
    # process_lines is a plain function here — the original passed
    # `self.process_lines`, which raises NameError outside a class
    read_lines_from_file_as_data_chunks(
        file_name, chunk_size=CHUNK_SIZE, callback=process_lines
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment