Skip to content

Instantly share code, notes, and snippets.

@rhinoceros
Forked from reorx/encoding_converter.py
Created November 14, 2016 14:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rhinoceros/390398020d79c63b3fdd67daf73300b8 to your computer and use it in GitHub Desktop.
Save rhinoceros/390398020d79c63b3fdd67daf73300b8 to your computer and use it in GitHub Desktop.
Detect character encodings and convert strings or files between them
#!/usr/bin/python
# -*- coding: utf-8 -*-
import chardet
# Families of related encodings. strict_detect() tries every member of a
# family, in the listed order, when chardet's guess falls inside that family
# but its confidence is below 1.
BROTHER_ENCODINGS = [('GB2312', 'GBK', 'GB18030')]
def strict_detect(text):
    """
    Detect the encoding of a byte string strictly.

    chardet's guess is only accepted if ``text`` really decodes with it.
    When chardet's confidence is below 1 and its guess belongs to a family
    listed in BROTHER_ENCODINGS, every member of that family is tried in
    order; otherwise only the guess itself is tried.

    :param text: the raw (encoded) byte string to inspect.
    :return: the name of the first candidate encoding that decodes ``text``.
    :raises DetectionFailed: if chardet reports no encoding at all, or if
        none of the candidate encodings can decode ``text``.
    """
    def get_brothers(ec):
        # Return the encoding family containing ``ec``, or None.
        for family in BROTHER_ENCODINGS:
            if ec in family:
                return family
        return None

    res = chardet.detect(text)
    guess = res['encoding']
    if guess is None:
        # chardet could not name any encoding (e.g. empty input); there is
        # nothing to try, so report it as a detection failure rather than
        # letting decode(None) blow up with a TypeError.
        raise DetectionFailed('raw result from chardet: %s' % (res,))

    if res['confidence'] < 1:
        possible_encodings = get_brothers(guess) or (guess,)
    else:
        possible_encodings = (guess,)

    for ec in possible_encodings:
        try:
            text.decode(ec)
        except (UnicodeDecodeError, LookupError):
            # UnicodeDecodeError: the bytes are not valid in this encoding.
            # LookupError: Python has no codec under the name chardet gave.
            continue
        return ec
    raise DetectionFailed('raw result from chardet: %s' % (res,))
class DetectionFailed(Exception):
    """Raised when no encoding that successfully decodes the input is found."""
def convert_string(s, to_ec, from_ec=None):
    """
    Re-encode the byte string ``s`` into the encoding ``to_ec``.

    If ``from_ec`` is not supplied (or is falsy), the source encoding is
    determined with strict_detect().
    """
    source_encoding = from_ec or strict_detect(s)
    decoded = s.decode(source_encoding)
    return decoded.encode(to_ec)
def convert_file(fname, to_ec, from_ec=None, replace=False, to_name=None):
    """
    convert a file to an indicated encoding.

    Arguments::

        :fname
            path of the file to convert.

        :to_ec
            the encoding the output file is written in.

        :from_ec
            if no source encoding was given, try to detect it.

        :replace
            if replace was set to True, it overrides the to_name argument and
            the converted content is written back over the original file.

        :to_name
            only effective when replace was False,
            if to_name was given, use it as the new file's name,
            if to_name was not given, a name is generated by inserting
            ``_<to_ec>`` before the file extension
            (``a.b.txt`` -> ``a.b_utf8.txt``, ``notes`` -> ``notes_utf8``).

    Raises AssertionError if the source file is empty. Whatever
    convert_string() raises is propagated, and in that case the target file
    is left untouched.
    """
    import os  # function-scope import, same style as main()

    if replace:
        to_name = fname
    elif not to_name:
        # splitext only splits at the last dot of the final path component,
        # so interior dots and dotted directory names are preserved.
        root, ext = os.path.splitext(fname)
        to_name = '%s_%s%s' % (root, to_ec, ext)

    # Binary mode: the content is raw encoded bytes, which must reach
    # convert_string() without newline translation or implicit decoding.
    with open(fname, 'rb') as source:
        buf = source.read()
    if not buf:
        # Explicit raise (not ``assert``) so the check survives ``python -O``
        # while keeping the exception type callers already see.
        raise AssertionError('file to convert should have content')

    # Convert BEFORE opening the target: opening with 'wb' truncates it, and
    # with replace=True the target is the source file itself.
    converted = convert_string(buf, to_ec, from_ec)
    with open(to_name, 'wb') as target:
        target.write(converted)
def main():
    """
    Command-line entry point.

    Parses the options and the one or two positional arguments (the file to
    convert and, optionally, the new file's name), then hands everything to
    convert_file().
    """
    import optparse

    parser = optparse.OptionParser(
        usage='Usage: %prog [options] file_to_convert [, new_file_name]')
    parser.add_option(
        '-f', '--from-ec', type='str',
        help='the original encoding of the file, if not given, detecting will be tried')
    parser.add_option(
        '-t', '--to-ec', type='str', default='utf8',
        help='the output encoding of the new file, if not given, use utf8 as default')
    parser.add_option(
        '-p', '--replace', action='store_true', default=False,
        help='if indicated, the new file will replace the old one, use it carefully')

    options, args = parser.parse_args()

    # parser.error() prints the message and exits, so nothing below runs
    # unless exactly one or two positional arguments were supplied.
    if len(args) not in (1, 2):
        parser.error('incorrect number of arguments, must be one or two')

    fname = args[0]
    to_name = args[1] if len(args) == 2 else None

    convert_file(
        fname,
        options.to_ec,
        from_ec=options.from_ec,
        replace=options.replace,
        to_name=to_name,
    )
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment