Skip to content

Instantly share code, notes, and snippets.

@vivisidea
Created March 31, 2013 09:37
Show Gist options
  • Save vivisidea/5280131 to your computer and use it in GitHub Desktop.
Save vivisidea/5280131 to your computer and use it in GitHub Desktop.
Linux's iconv tool SUCKS when processing Chinese characters, so I am building my own simple file-encoding conversion tool using Python. Make sure you know the original encoding.
#!/usr/bin/env python
# -*- encoding=utf-8 -*-
#
# convert files from one encoding to another encoding
#
import os
# Conversion settings: where to look, which files to pick, and the
# source / target encodings.
directory = '.'
surfix = '.txt'
fromenc = 'gbk'
toenc = 'utf8'

# Set printmode = True if you are not sure: convert() then prints the
# converted text instead of overwriting the files.
printmode = False
def convert(directory, surfix='', fromenc='gbk', toenc='utf8', printmode=None):
    '''
    Convert the files matching surfix in directory from fromenc to toenc,
    recursively.

    directory -- root folder to walk; a path that is not a directory is
                 silently skipped.
    surfix    -- only files whose path ends with this string are handled
                 (the default '' matches every file).
    fromenc   -- codec name of the current file content.  Bytes that are
                 not valid in it are dropped (errors='ignore'); they are
                 NOT replaced with a '?' mark.
    toenc     -- codec name the content is re-encoded with (strict).
    printmode -- True: write the converted bytes to stdout and leave the
                 files alone.  False: overwrite each file in place.
                 None (default): use the module-level ``printmode`` flag,
                 or False when the module does not define one.

    In overwrite mode a file that fails to convert is reported and left
    unchanged, and the walk continues with the next file.  Returns None.
    '''
    if printmode is None:
        # Older callers pass four arguments and rely on the module switch.
        printmode = globals().get('printmode', False)
    if not os.path.isdir(directory):
        return
    for name in os.listdir(directory):
        path = os.path.join(directory, name)  # relative path
        if os.path.isdir(path):
            print('%s is a directory ...' % path)
            convert(path, surfix, fromenc, toenc, printmode)
        elif path.endswith(surfix):
            if printmode:
                _preview_file(path, fromenc, toenc)
            else:
                _rewrite_file(path, fromenc, toenc)


def _transcode(path, fromenc, toenc, sink):
    '''Decode the file at path from fromenc and write it to the binary stream sink as toenc.'''
    # Local imports: the file's import block only provides os.
    import codecs
    import io

    # One encoder for the whole file, so codecs that emit a header (the
    # utf-16 BOM, for instance) emit it once instead of once per line.
    encoder = codecs.getincrementalencoder(toenc)()
    # Decoding happens in the text layer, so characters are never cut in half
    # at a 0x0A byte; newline='' keeps every line ending exactly as found.
    with io.open(path, 'r', encoding=fromenc, errors='ignore',
                 newline='') as src:
        for line in src:
            sink.write(encoder.encode(line))
    # Flush whatever state a stateful codec may still be holding.
    sink.write(encoder.encode(u'', final=True))


def _preview_file(path, fromenc, toenc):
    '''Write the converted content of path to stdout; the file is not modified.'''
    import sys

    sys.stdout.flush()  # keep earlier print() output ahead of the raw bytes
    # Python 3 exposes the byte stream as sys.stdout.buffer; on Python 2
    # sys.stdout accepts byte strings directly.
    sink = getattr(sys.stdout, 'buffer', sys.stdout)
    _transcode(path, fromenc, toenc, sink)
    sink.flush()


def _rewrite_file(path, fromenc, toenc):
    '''Replace the file at path with its content converted from fromenc to toenc.'''
    tmpfile = path + '.' + toenc  # write the output to a temporary file
    print('processing %s ...' % path)
    try:
        with open(tmpfile, 'wb') as tmp:
            _transcode(path, fromenc, toenc, tmp)
    except Exception as e:
        # Best effort: report the failure, drop the partial output (the
        # handle is already closed here) and keep the original file.
        print('error processing %s, error=%s' % (path, e))
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
        return
    # override the original file, DO YOUR OWN BACKUP !!!
    if hasattr(os, 'replace'):
        # Python 3.3+: overwrites an existing target on every platform.
        os.replace(tmpfile, path)
    else:
        os.rename(tmpfile, path)
if __name__ == '__main__':
    # Script entry point: ask for confirmation, then convert in place.
    # raw_input() is the Python 2 spelling of Python 3's input().
    try:
        ask = raw_input
    except NameError:
        ask = input
    choice = ask('it\'s highly recommended that you do your own backup before continue, continue?(Y/N)')
    # Only an explicit y/Y (surrounding whitespace tolerated) proceeds.
    if choice.strip().lower() == 'y':
        # Pass the settings declared at the top of the file instead of a
        # second, hard-coded copy of them, so editing them takes effect.
        convert(directory, surfix, fromenc, toenc)
    else:
        print('aborted.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment