tehasdf/unibytes.py

## unibytes.py
# coding: utf-8
"""
Example of how to deal with user objects that can implement either __unicode__
or __str__, and we don't know which - eg. exceptions, into which the user
might've passed a bytes message or a unicode message.

One function shows how to convert both cases to bytes (str), the other function
shows how to convert both cases to unicode - depending on what we need.

BTW note that it is much better to always know if we're dealing with bytes or
unicode, but it's not really possible with user-supplied objects :(
"""

# The `coding: utf-8` line is only needed because this file contains non-ASCII
# characters, here:
UNICODE_DATA = u'łódź'
# NOTE: a b'' literal containing non-ASCII characters is Python 2 only; it
# holds the bytes exactly as they are stored in this utf-8 encoded file.
BYTES_DATA = b'łódź'
# We wouldn't need the `coding:` line if we used escapes instead. These two
# assignments overwrite the names above:
UNICODE_DATA = u'\u0142\xf3d\u017a'
BYTES_DATA = '\xc5\x82\xc3\xb3d\xc5\xba'
# (which does the same thing)


# recap & explanation of bytes/unicode:
# u'x' is unicode, 'x' is bytes/str (in python 2), b'x' is also bytes/str

# uni.encode(encoding) -> bytes,  bytes.decode(encoding) -> unicode

# unicode(x) calls x.__unicode__, and if that doesn't exist, it calls
# x.__str__ and does a .decode() with the default encoding (ascii)

# str(x) calls x.__str__, doesn't call __unicode__, but if __str__ returns
# a unicode object, then it tries to .encode() with the default encoding

# note that: def x(self): return "ł" <-- this returns BYTES, not unicode
# however:   def x(self): return u"ł" <-- this returns unicode

# if you need a more in-depth explanation of bytes/unicode, see bit.ly/unipain

class A(object):
    """Example user object that implements only ``__str__``."""
    def __str__(self):
        # BYTES_DATA is the module-level byte string with non-ASCII bytes.
        return BYTES_DATA


class B(object):
    """Example user object that implements only ``__unicode__``."""
    def __unicode__(self):
        # UNICODE_DATA is the module-level unicode string.
        return UNICODE_DATA

# Sample inputs: a __str__-only object, a __unicode__-only object, and two
# exceptions built from a bytes message and from a unicode message.
a = A()
b = B()
v1 = ValueError(BYTES_DATA)
v2 = ValueError(UNICODE_DATA)


def f(x):
    """Return ``x`` rendered as bytes (``str`` in Python 2).

    ``x`` may implement ``__str__`` or ``__unicode__`` - we don't know which.
    When ``unicode(x)`` succeeds, its result is encoded with utf-8.  When it
    fails with UnicodeDecodeError, the output of ``str(x)`` is returned
    unchanged.
    """
    try:
        text = unicode(x)
    except UnicodeDecodeError:
        # unicode() could not decode what __str__ produced, so hand the
        # bytes from str() back as they are.
        return str(x)
    else:
        return text.encode('utf-8')


def f2(x):
    """Return ``x`` rendered as unicode text.

    ``x`` may implement ``__str__`` or ``__unicode__`` - we don't know which.
    When ``unicode(x)`` succeeds, its result is returned directly.  When it
    fails with UnicodeDecodeError, the output of ``str(x)`` is decoded as
    utf-8 instead.
    """
    try:
        return unicode(x)
    except UnicodeDecodeError:
        # unicode() could not decode what __str__ produced; treat the bytes
        # from str() as utf-8.
        raw = str(x)
        return raw.decode('utf-8')


# Self-check: for all four sample inputs f() gives the same utf-8 bytes and
# f2() gives the same unicode text.
assert f(a) == f(b) == f(v1) == f(v2) == BYTES_DATA
assert f2(a) == f2(b) == f2(v1) == f2(v2) == UNICODE_DATA
	# coding: utf-8
	"""
	Example of how to deal with user objects that can implement either __unicode__
	or __str__, and we don't know which - eg. exceptions, into which the user
	might've passed a bytes message or a unicode message.

	One function shows how to convert both cases to bytes (str), the other function
	shows how to convert both cases to unicode - depending on what we need.

	BTW note that it is much better to always know if we're dealing with bytes or
	unicode, but it's not really possible with user-supplied objects :(
	"""

	# The `coding: utf-8` line is only needed because this file contains non-ASCII
	# characters, here:
	UNICODE_DATA = u'łódź'
	# NOTE: a b'' literal containing non-ASCII characters is Python 2 only; it
	# holds the bytes exactly as they are stored in this utf-8 encoded file.
	BYTES_DATA = b'łódź'
	# We wouldn't need the `coding:` line if we used escapes instead. These two
	# assignments overwrite the names above:
	UNICODE_DATA = u'\u0142\xf3d\u017a'
	BYTES_DATA = '\xc5\x82\xc3\xb3d\xc5\xba'
	# (which does the same thing)


	# recap & explanation of bytes/unicode:
	# u'x' is unicode, 'x' is bytes/str (in python 2), b'x' is also bytes/str

	# uni.encode(encoding) -> bytes, bytes.decode(encoding) -> unicode

	# unicode(x) calls x.__unicode__, and if that doesn't exist, it calls
	# x.__str__ and does a .decode() with the default encoding (ascii)

	# str(x) calls x.__str__, doesn't call __unicode__, but if __str__ returns
	# a unicode object, then it tries to .encode() with the default encoding

	# note that: def x(self): return "ł" <-- this returns BYTES, not unicode
	# however: def x(self): return u"ł" <-- this returns unicode

	# if you need a more in-depth explanation of bytes/unicode, see bit.ly/unipain

	class A(object):
	    """Example user object that implements only ``__str__``."""

	    def __str__(self):
	        # BYTES_DATA is the module-level byte string with non-ASCII bytes.
	        return BYTES_DATA


	class B(object):
	    """Example user object that implements only ``__unicode__``."""

	    def __unicode__(self):
	        # UNICODE_DATA is the module-level unicode string.
	        return UNICODE_DATA

	# Sample inputs: a __str__-only object, a __unicode__-only object, and two
	# exceptions built from a bytes message and from a unicode message.
	a = A()
	b = B()
	v1 = ValueError(BYTES_DATA)
	v2 = ValueError(UNICODE_DATA)


	def f(x):
	    """Given an object that can implement __str__ or __unicode__, but we don't
	    know which, return bytes (str) of it.

	    If ``unicode(x)`` succeeds, the result is encoded with utf-8.  If it
	    raises UnicodeDecodeError, the output of ``str(x)`` is returned as-is.
	    """
	    try:
	        d = unicode(x)
	    except UnicodeDecodeError:
	        # unicode() could not decode what __str__ produced; return the
	        # bytes from str() unchanged.
	        return str(x)
	    return d.encode('utf-8')


	def f2(x):
	    """Given an object that can implement __str__ or __unicode__, but we don't
	    know which, return unicode() of it.

	    If ``unicode(x)`` succeeds, the result is returned directly.  If it
	    raises UnicodeDecodeError, the output of ``str(x)`` is decoded as utf-8.
	    """
	    try:
	        d = unicode(x)
	    except UnicodeDecodeError:
	        # unicode() could not decode what __str__ produced; treat the
	        # bytes from str() as utf-8.
	        d = str(x).decode('utf-8')
	    return d


	# Self-check: for all four sample inputs f() gives the same utf-8 bytes and
	# f2() gives the same unicode text.
	assert f(a) == f(b) == f(v1) == f(v2) == BYTES_DATA
	assert f2(a) == f2(b) == f2(v1) == f2(v2) == UNICODE_DATA