Skip to content

Instantly share code, notes, and snippets.

@codenamewei
Created March 29, 2022 00:17
Show Gist options
  • Save codenamewei/87d027123349e322c020fd716c63ec34 to your computer and use it in GitHub Desktop.
Save codenamewei/87d027123349e322c020fd716c63ec34 to your computer and use it in GitHub Desktop.
split file
def split_by_size(
        full_file_name_input=r'?',
        *,
        output_file_size=1 * 1024 * 1024 * 1024,
        codepage='utf8',
        add_header_to_every_output_file=True):
    '''
    Split a text file into numbered parts of roughly ``output_file_size`` bytes.

    The first line of the input is treated as a header. Each following line
    goes to part ``cumulative_encoded_bytes // output_file_size + 1``, so a
    line is never cut in half. Parts are written next to the input file as
    ``<name>_part_<NN><extension>``. Progress and a final line-count summary
    are printed to stdout.

    Args:
        full_file_name_input: Path of the file to split. The default ``'?'``
            is only a placeholder and must be replaced with a real path.
        output_file_size: Target size of each part in bytes (default 1 GiB,
            e.g. pass ``250 * 1024 * 1024`` for 250 MiB). Only data lines are
            counted; the repeated header is not.
        codepage: Text encoding used to read the input and write the parts.
        add_header_to_every_output_file: When true, the input's first line is
            written at the top of every part.

    Raises:
        ValueError: If ``output_file_size`` is not positive.
        OSError: If the input cannot be read or a part cannot be written.
    '''
    import codecs
    import io
    import os.path

    if output_file_size <= 0:
        raise ValueError('output_file_size must be a positive number of bytes')

    directory = os.path.dirname(full_file_name_input)
    # splitext keeps extensionless names intact ("data" -> "data_part_01")
    # instead of treating the whole name as the extension.
    stem, extension = os.path.splitext(
        os.path.basename(full_file_name_input))

    # Measure lines in encoded bytes, not characters, so multi-byte text is
    # sized correctly. The incremental encoder emits a BOM at most once.
    encoder = codecs.getincrementalencoder(codepage)()

    read_bytes = 0
    lines_read = 0
    lines_written = 0
    lines = []
    header = None
    current_part = None
    fout = None

    try:
        # newline='' disables newline translation on both ends, so line
        # endings are copied verbatim and the byte count matches the disk.
        with io.open(full_file_name_input, encoding=codepage,
                     newline='') as fin:
            print("Start reading file...")
            for i, line in enumerate(fin):
                if i == 0:
                    header = line
                    continue
                read_bytes += len(encoder.encode(line))
                lines_read += 1
                part_number = read_bytes // output_file_size + 1
                # read_bytes only grows, so part numbers never go backwards:
                # a change means the previous part is finished and can be
                # closed right away instead of keeping every handle open.
                if part_number != current_part:
                    if fout is not None:
                        fout.close()
                        print(
                            f'\t{lines_written:,.0f} lines were written into this file')
                        lines.append(lines_written)
                    fnameout = os.path.join(
                        directory,
                        f'{stem}_part_{part_number:02d}{extension}')
                    fout = io.open(fnameout, 'w', encoding=codepage,
                                   newline='')
                    current_part = part_number
                    lines_written = 0
                    print(f'Created a file {fnameout}')
                    if add_header_to_every_output_file:
                        fout.write(header)
                fout.write(line)
                lines_written += 1
    finally:
        # Close the part in progress even when reading or writing failed.
        if fout is not None:
            fout.close()

    # No part exists when the input is empty or holds only the header.
    if fout is not None:
        lines.append(lines_written)
        print(f'\t{lines_written:,.0f} lines were written into this file')

    print(f'\nInitially, a file {full_file_name_input} contains {lines_read} lines except header. It was splitted into {len(lines)} files with {lines} lines respectively')
    print(
        f'\nThe sum of lines {"is EQUAL" if lines_read == sum(lines) else "is NOT EQUAL"} to the total number of lines in the initial file.')
if __name__ == '__main__':
    # Take the file to split from the first command-line argument when one
    # is given. With no argument, keep the original call, which relies on
    # the default path of split_by_size.
    import sys

    if len(sys.argv) > 1:
        split_by_size(sys.argv[1])
    else:
        split_by_size()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment