Created
March 20, 2013 20:18
-
-
Save mhrivnak/5208064 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Candidate encodings, tried in this order by string_to_unicode().
# NOTE(review): 'iso-8859-1' maps every possible byte value, so decoding with
# it never raises UnicodeDecodeError; the 'cp1252' entry listed after it is
# therefore never reached. Order is left as-is to keep existing results stable.
ENCODING_LIST = ('utf8', 'iso-8859-1', 'cp1252')
def string_to_unicode(data, destructive=False):
    """
    Make a best effort to decode a byte string, trying each encoding in
    ENCODING_LIST in order and returning the first successful result.

    :param data: byte string to decode. Falsy values (empty string, None) and
                 text that is already decoded are returned unchanged.
    :type  data: str
    :param destructive: iff True, if every encoding in ENCODING_LIST fails,
                        decodes as utf8 and omits the bytes that cannot be
                        parsed (an error is logged). Default is False.
    :type  destructive: bool

    :return: data as a unicode object
    :rtype:  unicode

    :raises UnicodeDecodeError: if no encoding in ENCODING_LIST can decode
                                the data and destructive is False
    """
    if not data:
        return data

    # Already-decoded text has nothing left to decode. Calling .decode() on it
    # would trigger an implicit ASCII encode on Python 2 (raising an uncaught
    # UnicodeEncodeError for non-ASCII text) and does not exist on Python 3.
    if isinstance(data, type(u'')):
        return data

    for code in ENCODING_LIST:
        try:
            return data.decode(code)
        except UnicodeDecodeError:
            # try others
            continue

    if destructive:
        # omits any characters it cannot parse
        ret = data.decode('utf8', 'ignore')
        _LOG.error('string decode error: omitted unparsable chars in: %s', ret)
        return ret

    # UnicodeDecodeError requires (encoding, object, start, end, reason);
    # raising the bare class would fail with TypeError instead.
    raise UnicodeDecodeError(
        'utf8', data, 0, len(data),
        'unable to decode using any of: %s' % ', '.join(ENCODING_LIST))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment