Skip to content

Instantly share code, notes, and snippets.

@aljiwala
Last active July 24, 2018 08:45
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aljiwala/a6973b7582f1870927842e565f1f77cb to your computer and use it in GitHub Desktop.
Save aljiwala/a6973b7582f1870927842e565f1f77cb to your computer and use it in GitHub Desktop.
Divide CSV data into chunks
# Built-in imports.
from csv import reader as csv_reader
from os import makedirs
from os.path import join as join_path
from sys import argv

# Third party imports.
from pandas import read_csv
def get_row_count(src_filepath):
    """Return the number of CSV records stored in ``src_filepath``.

    Every record is counted, so a header record (if the file has one) and
    blank lines are included in the total. A quoted field that spans several
    physical lines counts as part of a single record.

    Args:
        src_filepath: Path of the CSV file to inspect.

    Returns:
        The number of records as an ``int``; ``0`` for an empty file.
    """
    # newline='' passes line endings to the csv parser untouched, which is
    # what lets it recognise newlines embedded inside quoted fields instead
    # of treating them as record separators.
    with open(src_filepath, 'r', newline='') as f:
        return sum(1 for _ in csv_reader(f))
def divide_in_chunks(chunksize, src_filepath, dst_filepath):
    """Split a CSV file into numbered part files of ``chunksize`` rows each.

    The source is read lazily, one chunk at a time, with its first column
    used as the index. Each chunk is written to ``dst_filepath`` as
    ``part1.csv``, ``part2.csv``, ... in reading order.

    Args:
        chunksize: Maximum number of data rows per output file.
        src_filepath: Path of the CSV file to split.
        dst_filepath: Directory that receives the part files. It is created
            (including missing parents) when it does not exist yet. An empty
            string leaves the part-file names relative to the current
            working directory.
    """
    # Writing a part file into a missing directory would fail, so make sure
    # the target exists first. An empty path is skipped because makedirs('')
    # raises.
    if dst_filepath:
        makedirs(dst_filepath, exist_ok=True)

    reader = read_csv(src_filepath, chunksize=chunksize, index_col=0)
    try:
        for part_number, chunk in enumerate(reader, start=1):
            chunk.to_csv(
                join_path(dst_filepath, 'part{}.csv'.format(part_number))
            )
    finally:
        # Release the source file handle even if writing a chunk fails.
        reader.close()
def main():
    """Read ``--name=value`` options from the command line and split the CSV.

    Recognised options:

    * ``--chunksize=<int>``: rows per output file (defaults to 100).
    * ``--src_filepath=<path>``: CSV file to split (required).
    * ``--dst_filepath=<dir>``: directory for the part files (required).

    Any other argument is ignored. When an option is malformed or a required
    one is missing, a message is printed and the function returns without
    splitting anything.
    """
    chunksize, src_filepath, dst_filepath = 100, '', ''
    chunksize_arg, src_filepath_arg, dst_filepath_arg = (
        '--chunksize', '--src_filepath', '--dst_filepath'
    )
    known_args = (chunksize_arg, src_filepath_arg, dst_filepath_arg)

    # argv[0] is the script name, not an option, so it is skipped.
    for arg in argv[1:]:
        # Split on the first '=' only, so values may contain '=' themselves.
        name, separator, value = arg.partition('=')
        if name not in known_args:
            continue
        if not separator:
            # e.g. "--chunksize 100": report it instead of failing on a
            # missing list element.
            print(name + ' must be given as ' + name + '=<value>.')
            return
        if name == chunksize_arg:
            try:
                chunksize = int(value)
            except ValueError:
                print(
                    chunksize_arg + ' must be an integer, got '
                    + repr(value) + '.'
                )
                return
        elif name == src_filepath_arg:
            src_filepath = value
        else:
            dst_filepath = value

    if chunksize < 1:
        print(chunksize_arg + ' must be at least 1.')
        return
    if src_filepath == '':
        print(src_filepath_arg + " isn't provided.")
        return
    if dst_filepath == '':
        print(dst_filepath_arg + " isn't provided.")
        return

    divide_in_chunks(chunksize, src_filepath, dst_filepath)
# Run the splitter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment