Skip to content

Instantly share code, notes, and snippets.

@metachris
Created November 23, 2015 10:10
Show Gist options
  • Save metachris/3eaa2d6b2ba820cb8c3b to your computer and use it in GitHub Desktop.
Save metachris/3eaa2d6b2ba820cb8c3b to your computer and use it in GitHub Desktop.
"""
Detect encoding of bytes/str and return unicode object.
Compatible with Python 2 and 3
Dependencies:
- chardet (https://pypi.python.org/pypi/chardet)
Author: Chris Hager <chris@linuxuser.at> https://www.metachris.com
License: Public Domain (The Unlicense)
"""
import sys
import chardet
# Major-version switch: True only when running under a Python 2 interpreter.
IS_PY2 = sys.version_info[0] < 3

if not IS_PY2:
    # Python 3 has no ``unicode`` builtin; alias it to ``str`` so code in
    # this module can use the name ``unicode`` on both major versions.
    unicode = str
def make_compat_str(in_str):
    """
    Guess the encoding of a str/bytes object and return unicode text.

    Text that is already unicode (``unicode`` on Python 2, ``str`` on
    Python 3) is returned unchanged. Byte strings are passed to
    ``chardet.detect`` and decoded with the encoding it reports.

    Args:
        in_str: A ``bytes``, ``str`` or ``unicode`` object.

    Returns:
        The decoded text as a unicode string, without a leading byte
        order mark. Empty input yields an empty unicode string.

    Raises:
        AssertionError: If ``in_str`` is not a bytes/str/unicode object.
        UnicodeDecodeError: If the bytes are not valid in the encoding
            that chardet reported.
    """
    assert isinstance(in_str, (bytes, str, unicode))
    if not in_str:
        return unicode()

    # Already-decoded text needs no detection. On Python 3 ``unicode`` is
    # ``str``; on Python 2 the native ``str`` is bytes and falls through.
    if isinstance(in_str, unicode):
        return in_str

    # Detect the encoding of the raw bytes
    enc = chardet.detect(in_str)
    encoding = enc['encoding']

    if encoding:
        out_str = in_str.decode(encoding)
    else:
        # chardet reports an encoding of None when it cannot make a guess;
        # decode(None) would raise TypeError, so fall back to a lossy
        # UTF-8 decode instead of crashing.
        out_str = in_str.decode('utf-8', 'replace')

    # Cleanup: endian-specific codecs (e.g. UTF-16BE/LE) and plain UTF-8
    # leave the byte order mark in the decoded text as U+FEFF. The ``u``
    # prefix matters: on Python 2 a plain '\ufeff' literal is the
    # six-character byte string, which never matches the BOM.
    if out_str.startswith(u'\ufeff'):
        out_str = out_str[1:]

    return out_str
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment