@iyvinjose
Last active November 21, 2023 13:29
Read large files line by line without loading the entire file into memory. Supports files of GB size.
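The helper below reads the file in fixed-size chunks, re-assembles lines that span chunk boundaries, and passes each complete line (or, optionally, each chunk's list of lines) to a callback.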
def read_lines_from_file_as_data_chunks(file_name, chunk_size, callback, return_whole_chunk=False):
    """
    read file line by line regardless of its size
    :param file_name: absolute path of file to read
    :param chunk_size: size of data to be read at a time
    :param callback: callback method, prototype ----> def callback(data, eof, file_name)
    :param return_whole_chunk: if True, callback receives a list of lines per chunk instead of one line at a time
    :return:
    """

    def read_in_chunks(file_obj, chunk_size=5000):
        """
        https://stackoverflow.com/a/519653/5130720
        Lazy function to read a file piece by piece.
        Default chunk size: 5000 characters.
        """
        while True:
            data = file_obj.read(chunk_size)
            if not data:
                break
            yield data

    fp = open(file_name)
    data_left_over = None

    # loop through the file chunk by chunk
    for chunk in read_in_chunks(fp, chunk_size):

        # if an incomplete line was left over from the previous chunk, prepend it
        if data_left_over:
            current_chunk = data_left_over + chunk
        else:
            current_chunk = chunk

        # split chunk by new line
        lines = current_chunk.splitlines()

        # check if the last line is complete
        if current_chunk.endswith('\n'):
            data_left_over = None
        else:
            data_left_over = lines.pop()

        if return_whole_chunk:
            callback(data=lines, eof=False, file_name=file_name)
        else:
            for line in lines:
                callback(data=line, eof=False, file_name=file_name)

    # flush any trailing line that did not end with a newline
    if data_left_over:
        lines = data_left_over.splitlines()
        if return_whole_chunk:
            callback(data=lines, eof=False, file_name=file_name)
        else:
            for line in lines:
                callback(data=line, eof=False, file_name=file_name)

    fp.close()
    callback(data=None, eof=True, file_name=file_name)
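Usage, assuming the function above is saved as data_loading_utils.py: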
import data_loading_utils

file_name = 'file_name.ext'
CHUNK_SIZE = 1000000  # configure this variable depending on your machine's hardware configuration


# callback method
def process_lines(data, eof, file_name):
    # check if end of file reached
    if not eof:
        # process data, data is one single line of the file
        pass
    else:
        # end of file reached
        pass


if __name__ == "__main__":
    data_loading_utils.read_lines_from_file_as_data_chunks(file_name, chunk_size=CHUNK_SIZE, callback=process_lines)

# process_lines is the callback method.
# It will be called for every line, with the data parameter holding one single line of the file at a time
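As a quick end-to-end check, here is a minimal sketch that writes a small sample file and counts its lines through the chunked reader. The file name sample.txt, the counts dict, and the count_lines callback are illustrative and not part of the gist.

import data_loading_utils

counts = {'lines': 0}


def count_lines(data, eof, file_name):
    # each call receives one complete line until eof is reached
    if not eof:
        counts['lines'] += 1


if __name__ == "__main__":
    # write a small sample file, then read it back through the chunked reader
    with open('sample.txt', 'w') as f:
        for i in range(100000):
            f.write('line %d\n' % i)

    data_loading_utils.read_lines_from_file_as_data_chunks(
        'sample.txt', chunk_size=1000000, callback=count_lines
    )
    print(counts['lines'])  # expected output: 100000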
@akushyn commented Dec 29, 2019

Great and very fast implementation!
Thank you!

@diana13bg commented
Hi, I have a question. In my case I am extracting an array of desired line elements while reading each line.
How can I return the array outside of def process_lines(data, eof, file_name) and continue building my new reconfigured dataset?

I tried adding "return new_list" to def process_lines(data, eof, file_name), but when I try to access it outside the function as
new_list = process_lines(data, eof, file_name), it gives me the error: NameError: name 'eof' is not defined.

Please advise. Thank you.
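One possible approach, sketched here as an assumption rather than the gist author's answer: the callback's return value is discarded because it is invoked internally by read_lines_from_file_as_data_chunks, so instead append to a list defined outside the callback and use that list after the call returns. The names new_list and the split(',') extraction are illustrative.

import data_loading_utils

new_list = []  # accumulator defined outside the callback


def process_lines(data, eof, file_name):
    if not eof:
        # extract the desired elements from the line and keep them
        new_list.append(data.split(','))  # illustrative: split a comma-separated line


data_loading_utils.read_lines_from_file_as_data_chunks(
    'file_name.ext', chunk_size=1000000, callback=process_lines
)
# new_list now holds one entry per line and can be used to build the reconfigured dataset
print(len(new_list))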
