metachris/make_compat_str.py

## make_compat_str.py
"""
Detect encoding of bytes/str and return unicode object.
Compatible with Python 2 and 3

Dependencies:
- chardet (https://pypi.python.org/pypi/chardet)

Author: Chris Hager <chris@linuxuser.at> https://www.metachris.com
License: Public Domain (The Unlicense)
"""

import sys
import chardet


# True when the running interpreter's major version is below 3.
IS_PY2 = sys.version_info[0] < 3
if not IS_PY2:
    # Python 3 has no ``unicode`` builtin; alias it to ``str`` so the rest
    # of the module can use a single name for text on both major versions.
    unicode = str


def make_compat_str(in_str):
    """
    Guess the encoding of a byte string and return it as unicode text.

    Args:
        in_str: A ``bytes``, ``str`` or ``unicode`` object. Input that is
            already unicode text is returned unchanged.

    Returns:
        The decoded unicode string with a leading byte order mark (BOM,
        U+FEFF) removed, or an empty unicode string for empty input.
        When chardet cannot guess an encoding, the bytes are decoded as
        UTF-8 and undecodable bytes become U+FFFD.

    Raises:
        AssertionError: If ``in_str`` is not a bytes/str/unicode object
            (only while assertions are enabled).
        UnicodeDecodeError: If the bytes cannot be decoded with the
            encoding chardet guessed.
    """
    # Kept as an assert so callers see the same exception type as before.
    assert isinstance(in_str, (bytes, str, unicode))
    if not in_str:
        return unicode()

    # Py2: chardet works on str/bytes, so unicode input is already done.
    if IS_PY2 and isinstance(in_str, unicode):
        return in_str

    # Py3: chardet works on bytes only, anything else is returned as-is.
    if not IS_PY2 and not isinstance(in_str, bytes):
        return in_str

    # chardet reports None as the encoding when it cannot make a guess;
    # passing that straight to decode() would raise a TypeError.
    encoding = chardet.detect(in_str)['encoding']
    if encoding is None:
        out_str = in_str.decode('utf-8', 'replace')
    else:
        out_str = in_str.decode(encoding)

    # Endian-specific codecs (UTF-16LE/BE, UTF-32LE/BE) keep the BOM as the
    # first character, so strip it whatever encoding was detected. The u''
    # prefix matters: on Python 2 a plain '\ufeff' is a six-character byte
    # string and would never match.
    if out_str.startswith(u'\ufeff'):
        out_str = out_str[1:]

    return out_str
	"""
	Detect encoding of bytes/str and return unicode object.
	Compatible with Python 2 and 3

	Dependencies:
	- chardet (https://pypi.python.org/pypi/chardet)

	Author: Chris Hager <chris@linuxuser.at> https://www.metachris.com
	License: Public Domain (The Unlicense)
	"""

	import sys
	import chardet


	# True when the running interpreter is older than Python 3.
	IS_PY2 = sys.version_info < (3, 0)
	if not IS_PY2:
		# Python 3 fix: there is no ``unicode`` builtin, so alias it to
		# ``str`` and let the rest of the module use one name for text.
		unicode = str


	def make_compat_str(in_str):
		"""
		Guess the encoding of a byte string and return it as unicode text.

		Args:
			in_str: A ``bytes``, ``str`` or ``unicode`` object. Input that
				is already unicode text is returned unchanged.

		Returns:
			The decoded unicode string with a leading byte order mark
			(BOM, U+FEFF) removed, or an empty unicode string for empty
			input. When chardet cannot guess an encoding, the bytes are
			decoded as UTF-8 and undecodable bytes become U+FFFD.

		Raises:
			AssertionError: If ``in_str`` is not a bytes/str/unicode
				object (only while assertions are enabled).
			UnicodeDecodeError: If the bytes cannot be decoded with the
				encoding chardet guessed.
		"""
		# Kept as an assert so callers see the same exception type as before.
		assert isinstance(in_str, (bytes, str, unicode))
		if not in_str:
			return unicode()

		# Py2: chardet works on str/bytes, so unicode input is already done.
		if IS_PY2 and isinstance(in_str, unicode):
			return in_str

		# Py3: chardet works on bytes only, anything else is returned as-is.
		if not IS_PY2 and not isinstance(in_str, bytes):
			return in_str

		# chardet reports None as the encoding when it cannot make a guess;
		# passing that straight to decode() would raise a TypeError.
		encoding = chardet.detect(in_str)['encoding']
		if encoding is None:
			out_str = in_str.decode('utf-8', 'replace')
		else:
			out_str = in_str.decode(encoding)

		# Endian-specific codecs (UTF-16LE/BE, UTF-32LE/BE) keep the BOM as
		# the first character, so strip it whatever encoding was detected.
		# The u'' prefix matters: on Python 2 a plain '\ufeff' is a
		# six-character byte string and would never match.
		if out_str.startswith(u'\ufeff'):
			out_str = out_str[1:]

		return out_str