Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Read large files line by line without loading the entire file into memory. Supports files that are many gigabytes in size.
def read_lines_from_file_as_data_chunks(file_name, chunk_size, callback, return_whole_chunk=False):
    """
    Read a file line by line regardless of its size.

    The file is consumed in fixed-size chunks so only about ``chunk_size``
    characters are held in memory at a time; a partial line at a chunk
    boundary is carried over and stitched to the next chunk before being
    handed to ``callback``.

    :param file_name: absolute path of file to read
    :param chunk_size: size of data to be read at a time
    :param callback: callback method, prototype ----> def callback(data, eof, file_name)
    :param return_whole_chunk: if True, ``callback`` receives the list of
        complete lines per chunk instead of one line per call
    :return: None
    """
    def read_in_chunks(file_obj, size):
        """
        Lazy generator yielding successive ``size``-character reads.
        https://stackoverflow.com/a/519653/5130720
        """
        while True:
            data = file_obj.read(size)
            if not data:
                break
            yield data

    def emit(lines):
        # dispatch complete lines to the callback in the requested shape
        if return_whole_chunk:
            callback(data=lines, eof=False, file_name=file_name)
        else:
            for line in lines:
                callback(data=line, eof=False, file_name=file_name)

    # BUGFIX: use a context manager so the file is always closed (the
    # original opened fp and never closed it), and forward the caller's
    # chunk_size (the original called read_in_chunks(fp) and always read
    # the default 5000 characters, ignoring the parameter).
    with open(file_name) as fp:
        data_left_over = None
        for chunk in read_in_chunks(fp, chunk_size):
            # prepend any incomplete line carried over from the previous chunk
            if data_left_over:
                current_chunk = data_left_over + chunk
            else:
                current_chunk = chunk
            lines = current_chunk.splitlines()
            # if the chunk does not end on a line boundary, the last element
            # is an incomplete line -- carry it over to the next iteration
            if current_chunk.endswith('\n'):
                data_left_over = None
            else:
                data_left_over = lines.pop()
            emit(lines)

        # flush the final partial line (file did not end with a newline).
        # BUGFIX: the original re-tested a stale current_chunk here, which
        # re-emitted the last chunk's lines whenever the file ended with
        # '\n', and raised NameError on an empty file.
        if data_left_over:
            emit(data_left_over.splitlines())

    # signal end of file to the consumer
    callback(data=None, eof=True, file_name=file_name)
# --- Usage example ----------------------------------------------------------
# BUGFIX: the original example had three errors that made it unrunnable:
#   1. ``import data_loading_utils.py`` -- the ``.py`` suffix must not appear
#      in an import statement (it attempts a submodule import that fails);
#   2. comment-only ``if``/``else`` bodies are a SyntaxError (``pass`` needed);
#   3. ``callback=self.process_lines`` -- ``self`` is undefined at module
#      level; the plain function name must be used.
import data_loading_utils

file_name = 'file_name.ext'
# configure this variable depending on your machine's hardware configuration
CHUNK_SIZE = 1000000


def process_lines(data, eof, file_name):
    """Callback: invoked once per line, then once more with eof=True."""
    if not eof:
        # process data, data is one single line of the file
        pass
    else:
        # end of file reached
        pass


if __name__ == "__main__":
    data_loading_utils.read_lines_from_file_as_data_chunks(
        file_name, chunk_size=CHUNK_SIZE, callback=process_lines
    )
    # process_lines is the callback method. It will be called for all the
    # lines, with parameter ``data`` representing one single line of the
    # file at a time.
@akushyn

This comment has been minimized.

Copy link

@akushyn akushyn commented Dec 29, 2019

Great and very fast implementation!
Thank you!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment