Skip to content

Instantly share code, notes, and snippets.

@hanx11
Last active October 19, 2019 05:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hanx11/500e79465b085caecf763881689e6459 to your computer and use it in GitHub Desktop.
Save hanx11/500e79465b085caecf763881689e6459 to your computer and use it in GitHub Desktop.
# -*- coding:utf-8 -*-
from datetime import datetime
def read_many_lines(fp, buffer_size=10000):
    """
    Read the next batch of at most ``buffer_size`` lines from ``fp``.

    Lines are fetched one at a time with ``fp.readline()`` and returned
    unmodified (line terminators included), in the order they were read.
    Calling the function repeatedly on the same object walks through the
    input batch by batch without skipping any line.

    :param fp: file-like object providing ``readline()``; it must return a
        falsy value (``''`` or ``b''``) once the input is exhausted
    :param buffer_size: maximum number of lines to return, default 10000
    :return: list of the lines read; empty when the input is exhausted or
        when ``buffer_size`` is not positive
    """
    lines = []
    # Only pull a line from fp while there is still room for it in this
    # batch. Reading ahead and then discovering the batch is full would
    # consume a line that is never returned to the caller.
    while len(lines) < buffer_size:
        line = fp.readline()
        if not line:
            # End of input.
            break
        lines.append(line)
    return lines
def process_big_file():
    """
    Re-encode ``big_gbk_file.csv`` from GBK to UTF-8 in batches.

    The input is read in binary mode through ``read_many_lines`` so that
    only one batch of lines is held in memory at a time. Each line is
    decoded as GBK, encoded as UTF-8, and the batch is appended to
    ``big_utf8_file.csv``. The size of every batch and the total elapsed
    time are printed to standard output.

    The output file is opened in append mode, so content already present
    in ``big_utf8_file.csv`` is kept and the converted data is added after
    it. A line that is not valid GBK makes ``decode`` raise
    ``UnicodeDecodeError``, which propagates to the caller.

    :return: None
    """
    start_time = datetime.now()
    # Both files are managed by ``with`` so they are closed even when a
    # decode or write error interrupts the conversion. The output file is
    # opened once instead of being reopened for every batch.
    with open('big_gbk_file.csv', 'rb') as read_file, \
            open('big_utf8_file.csv', 'ab') as write_file:
        lines = read_many_lines(read_file)
        while lines:
            lines = [line.decode('GBK').encode('utf8') for line in lines]
            print(len(lines))
            # One write per batch rather than one per line.
            write_file.write(b''.join(lines))
            lines = read_many_lines(read_file)
    end_time = datetime.now()
    print('Cost time: {}'.format(end_time - start_time))
def main():
    """Script entry point: run the GBK to UTF-8 file conversion."""
    process_big_file()
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment