Created
March 3, 2016 08:47
-
-
Save coder-tan/cd127c6563dc702b86bf to your computer and use it in GitHub Desktop.
convert text file encodings
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert text file encodings.
import codecs
import os
from shutil import copyfile
# Codec names for the supported Chinese encodings.
encoding_china = 'gb18030'
encoding_gbk = 'gbk'

# Encoding the input files are decoded with, and the encoding written out.
encoding_source = encoding_china
encoding_target = 'utf-8'
def con(source, target, source_encoding=None, target_encoding=None):
    """Re-encode the text file ``source`` into ``target``.

    The file is decoded and re-encoded chunk by chunk.  If the input cannot
    be decoded with the source encoding, ``source`` is copied to ``target``
    unchanged instead.

    Args:
        source: Path of the file to read.
        target: Path of the file to write; it is overwritten if it exists.
        source_encoding: Codec used to decode ``source``.  Defaults to the
            module-level ``encoding_source``.
        target_encoding: Codec used to encode ``target``.  Defaults to the
            module-level ``encoding_target``.

    Raises:
        ValueError: If ``source`` and ``target`` refer to the same file.
    """
    if source_encoding is None:
        source_encoding = encoding_source
    if target_encoding is None:
        target_encoding = encoding_target

    # Opening the target for writing truncates it, so converting a file onto
    # itself would wipe the input before a single byte had been read.
    if os.path.realpath(source) == os.path.realpath(target):
        raise ValueError(
            'source and target are the same file: %r' % (source,))

    blocksize = 1048576  # size hint handed to each read() call
    try:
        with codecs.open(source, "r", source_encoding) as source_file:
            with codecs.open(target, "w", target_encoding) as target_file:
                while True:
                    contents = source_file.read(blocksize)
                    if not contents:
                        break
                    target_file.write(contents)
    except UnicodeDecodeError:
        # The input is not valid text in the source encoding; fall back to
        # copying the original bytes over whatever was written so far.
        copyfile(source, target)
def test(source='/Users/tan/tan/dev/python/dian/source'):
    """Convert the tree at ``source`` and print the file count of both trees.

    The converted tree is expected at the path obtained by replacing
    ``'source'`` with ``'target'`` in ``source``; printing both counts lets
    the caller compare them by eye.

    Args:
        source: Root of the tree to convert.  Defaults to the path that was
            previously hard-coded here.
    """
    conn(source)
    check(source)
    check(source.replace('source', 'target'))
def check(path):
    """Print how many files ``count_dir`` finds under ``path``.

    Args:
        path: File or directory to inspect.
    """
    found = []
    count_dir(found, path)
    total = len(found)
    print(total)
def conn(path, target=None):
    """Mirror ``path`` into a parallel tree, converting each file with ``con``.

    Anything that is not a regular file is treated as a directory and
    walked recursively.  The path of every converted file is printed.

    Args:
        path: File or directory to convert.
        target: Destination path.  When omitted it is derived from ``path``
            by replacing ``'source'`` with ``'target'``.  The derivation is
            done once for the root only; nested entries keep their own
            names, so a file called ``resource.txt`` is not renamed.

    Raises:
        ValueError: If the destination is the same path as the input, which
            would make the conversion overwrite its own input.
    """
    if target is None:
        target = path.replace('source', 'target')
    if target == path:
        raise ValueError(
            'cannot derive a distinct target path from %r' % (path,))

    if os.path.isfile(path):
        con(path, target)
        print(path)
        return

    if not os.path.exists(target):
        os.makedirs(target)
    for name in os.listdir(path):
        conn(os.path.join(path, name), os.path.join(target, name))
def count_dir(files, file):
    """Append the path of every file found under ``file`` to ``files``.

    Args:
        files: List that the discovered paths are appended to; it is
            modified in place.
        file: Path to inspect.  A regular file is appended as-is; anything
            else is listed as a directory and walked recursively.
    """
    if os.path.isfile(file):
        files.append(file)
        return
    for name in os.listdir(file):
        count_dir(files, os.path.join(file, name))
if __name__ == '__main__':
    # Run the conversion only when executed as a script, so that importing
    # this module does not touch the filesystem.
    test()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment