Skip to content

Instantly share code, notes, and snippets.

@miraculixx
Last active November 22, 2021 15:57
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 5 You must be signed in to fork a gist
  • Save miraculixx/84ac194c77f04817eeb3 to your computer and use it in GitHub Desktop.
Save miraculixx/84ac194c77f04817eeb3 to your computer and use it in GitHub Desktop.
Tired of Python's UnicodeDecodeError, ascii codec can't decode? Here's how to fix it, once and for all.
# Python ascii codec can't decode and unicode mess
#
# check this out https://pythonhosted.org/kitchen/unicode-frustrations.html
# and this http://www.joelonsoftware.com/articles/Unicode.html
#
# The short of it is this
# 1. If you can, always set PYTHONIOENCODING=utf8 before you start your python programs.
# 2. If you can't, or you can't ensure this, always use the following lambda _u to get unicode text
# wherever you convert to strings (str.format, str % etc.)
#
# So for 2. you always do this:
# _u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t
# _uu = lambda *tt: tuple(_u(t) for t in tt)
# # use like this
# text='Some string with codes > 127, like Zürich'
# print "Some unknown input %s" % _u(text)
# print "Multiple inputs %s, %s" % _uu(text, text)
# # or like this
# print u"Some string with codecs > 127 {}".format(_u(text))
# print u"Multiple inputs {}, {}".format(*_uu(text, text))
# ---
# amazingly, you can assign UTF8 to strings, it works nicely
print "==> We'll work with these strings"
text = "Zürich"
utext = u"Zürich"
print text, type(text)
print utext, type(utext)
# will raise UnicodeDeocdeError
try:
print "==> Try as unicode(text) => error"
print unicode(text)
except UnicodeDecodeError as e:
print e
# try again
print "==> Try as with replacing unknown characters"
print unicode(text, errors="replace")
# again
print "==> Try as with ignoring unknown characters"
print unicode(text, errors='ignore')
# again
print "==> Try as with UTF-8 encoding"
print unicode(text, encoding="UTF-8")
# but this doesn't work!!
try:
print "But amazingly you can't encode unicode ext into UTF8"
print unicode(utext, encoding="UTF-8")
except Exception as e:
print "because %s" %e
# be smart
print "==> So you want unicode in string formats, huh?"
try:
print u"%s" % text
except UnicodeDecodeError as e:
print "Not so fast! %s" % e
# let's try that again
print "==> Now for real using string formats"
print u"%s" % unicode(text, 'UTF-8')
# so you think that's taking all the fun out of working with string formatting?
print "==> Doing it like this works more naturally"
print unicode("%s" % text, 'UTF-8')
try:
print "==> except if the string is already unicode:"
print unicode("%s" % utext, 'UTF-8')
except Exception as e:
print "sorry, no can do, %s" % e
# can we join, slice etc. such strings?
otext = "ü".join(text)
print otext
# even better, use export PYTHONIOENCODING=utf8, if you can at start up time
# see http://stackoverflow.com/a/27066059/890242
#---------
# let's deal with unicode itself
print "===> Unicode text can just be printed like that, because they are of type unicode"
print "%s" % utext, type("%s" % utext)
# but if you combine the two, the fun starts
print "===> Unicode + byte text cannot just be printed like that, because they are of type unicode"
try:
print "%s == %s" % (utext, text), type("%s" % utext)
except UnicodeDecodeError as e:
print "because, well, %s" % e
# try again
print "===> But you can covert them all to unicode or UTF8"
print unicode("%s == %s" % (utext, unicode(text, 'UTF-8')))
print "%s == %s" % (utext, unicode(text, 'UTF-8'))
try:
print "%s == %s" % (utext.decode('UTF-8'), text.decode('UTF-8'))
except UnicodeDecodeError as e:
print "no so fast... %s" % e
except Exception as e:
print "no so fast... %s" % e
# again
print "===> byte strings with codes > 127 are harder. You have to DECODE to utf8"
print text.decode('UTF-8')
try:
print text.encode('UTF-8')
except Exception as e:
print "because encode won't work, %s" % e
print "===> while you have to ENCODE unicodes. You can't decode unicode into UTF-8."
print "works:", utext.encode('UTF-8')
try:
print "doesn't:", utext.decode('UTF-8')
except Exception as e:
print "%s" % e
# in summary, if you know you want
print "==> So in summary, always use unicode(text, 'UTF-8') to ensure you have unicode text from byte code"
print unicode(text, 'UTF-8')
print "==> or save-get a unicode object from your inputs, whether unicode or byte string"
_u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t
#else unicode(text.encode('UTF-8', 'replace'))
print _u(text), type(_u(text))
print _u(utext), type(_u(utext))
print "==> then you can mix match safely"
text = _u(text)
print "%s == %s" % (text, utext)
print u"%s == %s" % (text, utext)
# Conclusion: Always use this:
# guarantee unicode
_u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t
_uu = lambda *tt: tuple(_u(t) for t in tt)
# guarantee byte string in UTF8
_u8 = lambda t: t.encode('UTF-8', 'replace') if isinstance(t, unicode) else t
_uu8 = lambda *tt: tuple(_u8(t) for t in tt)
text='Some string with codes > 127, like Zürich'
print "==> with _u, _uu"
print _u(text), type(_u(text))
print _u(utext), type(_u(utext))
print _uu(text, utext), type(_uu(text, utext))
print "==> with u8, uu8"
print _u8(text), type(_u8(text))
print _u8(utext), type(_u8(utext))
print _uu8(text, utext), type(_uu8(text, utext))
# with % formatting, always use _u() and _uu()
print "Some unknown input %s" % _u(text)
print "Multiple inputs %s, %s" % _uu(text, text)
# but with .format be sure to always work with unicode strings
print u"Also works with formats: {}".format(_u(text))
print u"Also works with formats: {},{}".format(*_uu(text, text))
# ... or use _u8 and _uu8, because string.format expects byte strings
print "Also works with formats: {}".format(_u8(text))
print "Also works with formats: {},{}".format(*_uu8(text, text))
# lambdas to convert into unicode and bytestring UTF8 from any basestring (unicode or str)
#
# Use as follows:
# text = 'Zürich'
# utext = u'Zürich'
# print "Text %s %s" % _uu(text, utext)
# print u"Text %s" % _u(text)
# print "Text {},{}".format(*_uu8(text, utext))
# print u"Text {},{}".format(*_uu(text, utext))
#
#
# guarantee unicode string: byte strings are decoded as UTF-8 (undecodable
# bytes are replaced), any other value is returned unchanged.
# (``bytes`` is an alias of ``str`` on Python 2.6+, so the check is unchanged
# there and additionally works on Python 3, where ``str`` is already text)
_u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, bytes) else t
# _u applied to every positional argument; returns a tuple
_uu = lambda *tt: tuple(_u(t) for t in tt)
# guarantee byte string in UTF8 encoding: unicode text is encoded as UTF-8,
# any other value is returned unchanged.
# (``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3.3+)
_u8 = lambda t: t.encode('UTF-8', 'replace') if isinstance(t, type(u'')) else t
# _u8 applied to every positional argument; returns a tuple
_uu8 = lambda *tt: tuple(_u8(t) for t in tt)
def main():
text='Some string with codes > 127, like Zürich'
utext=u'Some string with codes > 127, like Zürich'
print "==> with _u, _uu"
print _u(text), type(_u(text))
print _u(utext), type(_u(utext))
print _uu(text, utext), type(_uu(text, utext))
print "==> with u8, uu8"
print _u8(text), type(_u8(text))
print _u8(utext), type(_u8(utext))
print _uu8(text, utext), type(_uu8(text, utext))
# with % formatting, always use _u() and _uu()
print "Some unknown input %s" % _u(text)
print "Multiple inputs %s, %s" % _uu(text, text)
# but with .format be sure to always work with unicode strings
print u"Also works with formats: {}".format(_u(text))
print u"Also works with formats: {},{}".format(*_uu(text, text))
# ... or use _u8 and _uu8, because string.format expects byte strings
print "Also works with formats: {}".format(_u8(text))
print "Also works with formats: {},{}".format(*_uu8(text, text))
if __name__ == '__main__':
main()
# that's the actual mess with unicode() in Python. It only works on either bytestring or unicode input
# and you can't actually be sure what you get every single time
def doit(*args):
print "trying", args
text = args[0]
ttype = type(text)
try:
text = unicode(*args)
except:
print "==> DID NOT WORK: %s (type %s)" % (text, ttype)
else:
print "==> %s worked (type %s)" % (text, ttype)
# Goal: one uniform way to call unicode() that copes with both byte strings
# and unicode objects. Each tuple below is one argument list for doit().
# doesn't work
for call_args in (('zürich',), (u'zürich', 'utf-8')):
    doit(*call_args)
# works
for call_args in ((u'zürich',), ('zürich', 'utf-8')):
    doit(*call_args)
## _u solves it by ensuring we have what we want
for call_args in (
    (_u('Zürich'),),
    (_u(u'Zürich'),),
    (_u8(u'Zürich'), 'utf-8'),
    (_u8('Zürich'), 'utf-8'),
):
    doit(*call_args)
@miraculixx
Copy link
Author

great presentation explaining the nitty-gritty of Python's unicode pain by Ned Batchelder

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment