Skip to content

Instantly share code, notes, and snippets.

@c3333
Forked from miraculixx/asciidecodeerror.py
Created July 22, 2020 15:02
Show Gist options
  • Save c3333/63b1683fd04e1149bc1b715e32354c30 to your computer and use it in GitHub Desktop.
Save c3333/63b1683fd04e1149bc1b715e32354c30 to your computer and use it in GitHub Desktop.
Tired of Python's UnicodeDecodeError, ascii codec can't decode? Here's how to fix it, once and for all.
# Python ascii codec can't decode and unicode mess
#
# check this out https://pythonhosted.org/kitchen/unicode-frustrations.html
# and this http://www.joelonsoftware.com/articles/Unicode.html
#
# The short of it is this
# 1. If you can, always set PYTHONIOENCODING=utf8 before you start your python programs.
# 2. If you can't, or can't guarantee it, always use the following lambda _u to get unicode text
# wherever you convert to strings (str.format, str % etc.)
#
# So for 2. you always do this:
# _u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t
# _uu = lambda *tt: tuple(_u(t) for t in tt)
# # use like this
# text='Some string with codes > 127, like Zürich'
# print "Some unknown input %s" % _u(text)
# print "Multiple inputs %s, %s" % _uu(text, text)
# # or like this
# print u"Some string with codes > 127 {}".format(_u(text))
# print u"Multiple inputs {}, {}".format(*_uu(text, text))
# ---
# amazingly, you can assign UTF8 to strings, it works nicely
print "==> We'll work with these strings"
text = "Zürich"
utext = u"Zürich"
print text, type(text)
print utext, type(utext)
# will raise UnicodeDeocdeError
try:
print "==> Try as unicode(text) => error"
print unicode(text)
except UnicodeDecodeError as e:
print e
# try again
print "==> Try as with replacing unknown characters"
print unicode(text, errors="replace")
# again
print "==> Try as with ignoring unknown characters"
print unicode(text, errors='ignore')
# again
print "==> Try as with UTF-8 encoding"
print unicode(text, encoding="UTF-8")
# but this doesn't work!!
try:
print "But amazingly you can't encode unicode ext into UTF8"
print unicode(utext, encoding="UTF-8")
except Exception as e:
print "because %s" %e
# be smart
print "==> So you want unicode in string formats, huh?"
try:
print u"%s" % text
except UnicodeDecodeError as e:
print "Not so fast! %s" % e
# let's try that again
print "==> Now for real using string formats"
print u"%s" % unicode(text, 'UTF-8')
# so you think that's taking all the fun out of working with string formatting?
print "==> Doing it like this works more naturally"
print unicode("%s" % text, 'UTF-8')
try:
print "==> except if the string is already unicode:"
print unicode("%s" % utext, 'UTF-8')
except Exception as e:
print "sorry, no can do, %s" % e
# can we join, slice etc. such strings?
otext = "ü".join(text)
print otext
# even better, use export PYTHONIOENCODING=utf8, if you can at start up time
# see http://stackoverflow.com/a/27066059/890242
#---------
# let's deal with unicode itself
print "===> Unicode text can just be printed like that, because they are of type unicode"
print "%s" % utext, type("%s" % utext)
# but if you combine the two, the fun starts
print "===> Unicode + byte text cannot just be printed like that, because they are of type unicode"
try:
print "%s == %s" % (utext, text), type("%s" % utext)
except UnicodeDecodeError as e:
print "because, well, %s" % e
# try again
print "===> But you can covert them all to unicode or UTF8"
print unicode("%s == %s" % (utext, unicode(text, 'UTF-8')))
print "%s == %s" % (utext, unicode(text, 'UTF-8'))
try:
print "%s == %s" % (utext.decode('UTF-8'), text.decode('UTF-8'))
except UnicodeDecodeError as e:
print "no so fast... %s" % e
except Exception as e:
print "no so fast... %s" % e
# again
print "===> byte strings with codes > 127 are harder. You have to DECODE to utf8"
print text.decode('UTF-8')
try:
print text.encode('UTF-8')
except Exception as e:
print "because encode won't work, %s" % e
print "===> while you have to ENCODE unicodes. You can't decode unicode into UTF-8."
print "works:", utext.encode('UTF-8')
try:
print "doesn't:", utext.decode('UTF-8')
except Exception as e:
print "%s" % e
# in summary, if you know you want
print "==> So in summary, always use unicode(text, 'UTF-8') to ensure you have unicode text from byte code"
print unicode(text, 'UTF-8')
print "==> or save-get a unicode object from your inputs, whether unicode or byte string"
_u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t
#else unicode(text.encode('UTF-8', 'replace'))
print _u(text), type(_u(text))
print _u(utext), type(_u(utext))
print "==> then you can mix match safely"
text = _u(text)
print "%s == %s" % (text, utext)
print u"%s == %s" % (text, utext)
# Conclusion: Always use this:
# guarantee unicode
_u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t
_uu = lambda *tt: tuple(_u(t) for t in tt)
# guarantee byte string in UTF8
_u8 = lambda t: t.encode('UTF-8', 'replace') if isinstance(t, unicode) else t
_uu8 = lambda *tt: tuple(_u8(t) for t in tt)
text='Some string with codes > 127, like Zürich'
print "==> with _u, _uu"
print _u(text), type(_u(text))
print _u(utext), type(_u(utext))
print _uu(text, utext), type(_uu(text, utext))
print "==> with u8, uu8"
print _u8(text), type(_u8(text))
print _u8(utext), type(_u8(utext))
print _uu8(text, utext), type(_uu8(text, utext))
# with % formatting, always use _u() and _uu()
print "Some unknown input %s" % _u(text)
print "Multiple inputs %s, %s" % _uu(text, text)
# but with .format be sure to always work with unicode strings
print u"Also works with formats: {}".format(_u(text))
print u"Also works with formats: {},{}".format(*_uu(text, text))
# ... or use _u8 and _uu8, because string.format expects byte strings
print "Also works with formats: {}".format(_u8(text))
print "Also works with formats: {},{}".format(*_uu8(text, text))
# lambdas to convert into unicode and bytestring UTF8 from any basestring (unicode or str)
#
# Use as follows:
# text = 'Zürich'
# utext = u'Zürich'
# print "Text %s %s" % _uu(text, utext)
# print u"Text %s" % _u(text)
# print "Text {},{}".format(*_uu8(text, utext))
# print u"Text {},{}".format(*_uu(text, utext))
#
#
# Text type of the running interpreter: `unicode` on Python 2, `str` on Python 3.
try:
    _text_type = unicode
except NameError:
    _text_type = str


def _u(t):
    """Guarantee a unicode string.

    A byte string is decoded as UTF-8, with undecodable bytes replaced rather
    than raising. Any other object, including text that is already unicode,
    is returned unchanged.
    """
    # `bytes` is an alias of `str` on Python 2, so the check is unchanged
    # there; on Python 3 it selects real byte strings instead of text.
    return t.decode('UTF-8', 'replace') if isinstance(t, bytes) else t


def _uu(*tt):
    """Return a tuple with `_u` applied to each positional argument."""
    return tuple(_u(t) for t in tt)


def _u8(t):
    """Guarantee a byte string in UTF-8 encoding.

    A unicode string is encoded as UTF-8, with unencodable characters
    replaced rather than raising. Any other object, including a byte string,
    is returned unchanged.
    """
    return t.encode('UTF-8', 'replace') if isinstance(t, _text_type) else t


def _uu8(*tt):
    """Return a tuple with `_u8` applied to each positional argument."""
    return tuple(_u8(t) for t in tt)
def main():
    """Print what _u/_uu and _u8/_uu8 return for a byte and a unicode literal.

    Every print passes a single parenthesised argument, which is valid both
    as a Python 2 print statement and as a Python 3 function call. Lines that
    print two items build the same space-separated text with ``%``.
    """
    text = 'Some string with codes > 127, like Zürich'
    utext = u'Some string with codes > 127, like Zürich'

    print("==> with _u, _uu")
    print("%s %s" % (_u(text), type(_u(text))))
    print("%s %s" % (_u(utext), type(_u(utext))))
    print("%s %s" % (_uu(text, utext), type(_uu(text, utext))))

    print("==> with u8, uu8")
    print("%s %s" % (_u8(text), type(_u8(text))))
    print("%s %s" % (_u8(utext), type(_u8(utext))))
    print("%s %s" % (_uu8(text, utext), type(_uu8(text, utext))))

    # with % formatting, always use _u() and _uu()
    print("Some unknown input %s" % _u(text))
    print("Multiple inputs %s, %s" % _uu(text, text))

    # but with .format be sure to always work with unicode strings
    print(u"Also works with formats: {}".format(_u(text)))
    print(u"Also works with formats: {},{}".format(*_uu(text, text)))

    # ... or use _u8 and _uu8, because string.format expects byte strings
    print("Also works with formats: {}".format(_u8(text)))
    print("Also works with formats: {},{}".format(*_uu8(text, text)))


if __name__ == '__main__':
    main()
# that's the actual mess with unicode() in Python. It only works on either bytestring or unicode input
# and you can't actually be sure what you get every single time
def doit(*args):
print "trying", args
text = args[0]
ttype = type(text)
try:
text = unicode(*args)
except:
print "==> DID NOT WORK: %s (type %s)" % (text, ttype)
else:
print "==> %s worked (type %s)" % (text, ttype)
# Goal: one uniform way to call unicode(), whether the input is a byte string
# or already unicode.

# these call forms don't work
for call_args in (('zürich',), (u'zürich', 'utf-8')):
    doit(*call_args)

# these do
for call_args in ((u'zürich',), ('zürich', 'utf-8')):
    doit(*call_args)

## _u / _u8 solve it by making sure the input has the type the call form needs
for call_args in (
    (_u('Zürich'),),
    (_u(u'Zürich'),),
    (_u8(u'Zürich'), 'utf-8'),
    (_u8('Zürich'), 'utf-8'),
):
    doit(*call_args)
@c3333
Copy link
Author

c3333 commented Jul 22, 2020

lambdas zum Konvertieren in Unicode und Bytestring UTF8 von jedem Basisring (Unicode oder str)

Verwenden Sie wie folgt:

text = u'Zürich '

utext = 'Zürich'

print "Text% s% s"% _uu (Text, Utext)

print u "Text% s"% _u (Text)

print "Text {}, {}". Format (* _ uu8 (Text, Utext))

print u "Text {}, {}". Format (* _ uu (Text, Utext))

Unicode-Zeichenfolge garantieren

_u = Lambda t : t . decodiere ( 'UTF-8' , 'ersetzen' ) wenn isinstance ( t , str ) sonst t
_uu = Lambda * tt : Tupel ( _u ( t ) für t in tt )

Garantie Byte-String in UTF8-Codierung

_u8 = Lambda t : t . kodieren ( 'UTF-8' , 'ersetzen' ) , wenn isinstance ( t , Unicode ) sonst t
_uu8 = Lambda * tt : Tupel ( _u8 ( t ) für t in tt )

def main ():
text = 'Ein String mit Codes> 127, wie Zürich'
utext = u'Einiger String mit Codes> 127, wie Zürich '
print "==> with _u, _uu"
Drucken _u ( Text ), Typ ( _u ( text ))
print _u ( utext ), type ( _u ( utext ))
print _uu ( Text , Utext ), Typ ( _uu ( Text , Utext ))
print "==> mit u8, uu8"
Druck _u8 ( Text ), Typ ( _u8 ( text ))
print _u8 ( utext ), type ( _u8 ( utext ))
print _uu8 ( text , utext ), type ( _uu8 ( text , utext ))
# Verwenden Sie bei% -Formatierung immer _u () und _uu ()
print "Einige unbekannte Eingaben% s" % _u ( Text )
print "Mehrere Eingaben% s,% s" % _uu ( Text , Text )
# aber mit .format muss immer mit Unicode-Strings gearbeitet werden
print u "Funktioniert auch mit folgenden Formaten: {}" . Format ( _u ( Text ))
print u "Funktioniert auch mit folgenden Formaten: {}, {}" . Format ( * _uu ( Text , Text ))
# ... oder verwenden Sie _u8 und _uu8, da string.format Byte-Strings erwartet
print "Funktioniert auch mit Formaten: {}" . Format ( _u8 ( Text ))
print "Funktioniert auch mit folgenden Formaten: {}, {}" . Format ( * _uu8 ( Text , Text ))

if name == 'main' :

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment