Created
March 29, 2022 00:17
-
-
Save codenamewei/87d027123349e322c020fd716c63ec34 to your computer and use it in GitHub Desktop.
split file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def split_by_size(
        full_file_name_input=r'?',
        output_file_size=1 * 1024 * 1024 * 1024,
        codepage='utf8',
        add_header_to_every_output_file=True):
    '''
    Split a text file into numbered part files of roughly equal byte size.

    The first line of the input is treated as a header. Every following
    line goes to the part numbered
    ``bytes_read_so_far // output_file_size + 1``, where the running byte
    count already includes the line being placed (the header itself is not
    counted). Parts are created next to the input file and are named
    ``<name>_part_<NN><extension>``.

    Parameters:
    full_file_name_input : path of the file to split
    output_file_size : target size of one part in bytes; the default
        1 * 1024 * 1024 * 1024 is 1 Gigabyte, use e.g. 250 * 1024 * 1024
        for 250 Megabytes
    codepage : text encoding used to read the input and write the parts
    add_header_to_every_output_file : when true the header line is written
        at the top of each part; when false it is not written to any part

    Raises:
    ValueError : if output_file_size is not positive

    Progress messages and a final line-count summary are printed to
    stdout; nothing is returned.
    '''
    import os.path
    import io

    if output_file_size <= 0:
        raise ValueError('output_file_size must be a positive number of bytes')

    output_dir = os.path.dirname(full_file_name_input)
    # splitext leaves extension-less names intact; splitting on '.' by hand
    # turned the whole name into the "extension" for such files.
    file_name_input, file_name_input_extension = os.path.splitext(
        os.path.basename(full_file_name_input))

    read_bytes = 0
    lines = list()
    lines_read = 0
    # Pre-initialised so that an empty or header-only input does not hit an
    # unbound name after the loop.
    lines_written = 0
    header = ''
    current_part = None
    fout = None

    try:
        with io.open(full_file_name_input, encoding=codepage) as fin:
            print("Start reading file...")
            for i, line in enumerate(fin):
                if i == 0:
                    header = line
                    continue
                # Measure the encoded size: len(line) would count
                # characters, which undercounts multi-byte text.
                read_bytes += len(line.encode(codepage))
                lines_read += 1
                part_number = read_bytes // output_file_size + 1
                if part_number != current_part:
                    # read_bytes only grows, so a finished part is never
                    # revisited and can be closed right away instead of
                    # keeping every handle open until the end.
                    if fout is not None:
                        fout.close()
                        fout = None
                        print(
                            f'\t{lines_written:,.0f} lines were written into this file')
                        lines.append(lines_written)
                    fnameout = os.path.join(
                        output_dir,
                        f'{file_name_input}_part_{part_number:02.0f}{file_name_input_extension}')
                    fout = io.open(fnameout, 'w', encoding=codepage)
                    current_part = part_number
                    print(f'Created a file {fnameout}')
                    if add_header_to_every_output_file:
                        fout.write(header)
                    lines_written = 0
                fout.write(line)
                lines_written += 1
    finally:
        # Also reached on errors, so the part being written is not leaked.
        if fout is not None:
            fout.close()

    # Account for the last part only if at least one part was created.
    if current_part is not None:
        lines.append(lines_written)
        print(f'\t{lines_written:,.0f} lines were written into this file')

    print(f'\nInitially, a file {full_file_name_input} contains {lines_read} lines except header. It was splitted into {len(lines)} files with {lines} lines respectively')
    print(
        f'\nThe sum of lines {"is EQUAL" if lines_read == sum(lines) else "is NOT EQUAL"} to the total number of lines in the initial file.')
if __name__ == '__main__':
    import sys

    # An optional first command-line argument names the file to split;
    # without it the function falls back to its own default path.
    if len(sys.argv) > 1:
        split_by_size(sys.argv[1])
    else:
        split_by_size()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment