-
-
Save c3333/63b1683fd04e1149bc1b715e32354c30 to your computer and use it in GitHub Desktop.
Tired of Python's UnicodeDecodeError, "ascii codec can't decode"? Here's how to fix it, once and for all.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Python "ascii codec can't decode" and the unicode mess
#
# check this out https://pythonhosted.org/kitchen/unicode-frustrations.html
# and this http://www.joelonsoftware.com/articles/Unicode.html
#
# The short of it is this
# 1. If you can, always set PYTHONIOENCODING=utf8 before you start your python programs.
# 2. If you can't or you can't ensure this, always use the following lambda _u to get unicode text
#    wherever you convert to strings (str.format, str % etc.)
#
# So for 2. you always do this:
# _u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t
# _uu = lambda *tt: tuple(_u(t) for t in tt)
# # use like this
# text='Some string with codes > 127, like Zürich'
# print "Some unknown input %s" % _u(text)
# print "Multiple inputs %s, %s" % _uu(text, text)
# # or like this
# print u"Some string with codes > 127 {}".format(_u(text))
# print u"Multiple inputs {}, {}".format(*_uu(text, text))
# ---
# amazingly, you can assign UTF8 to strings, it works nicely
print "==> We'll work with these strings" | |
text = "Zürich" | |
utext = u"Zürich" | |
print text, type(text) | |
print utext, type(utext) | |
# will raise UnicodeDeocdeError | |
try: | |
print "==> Try as unicode(text) => error" | |
print unicode(text) | |
except UnicodeDecodeError as e: | |
print e | |
# try again | |
print "==> Try as with replacing unknown characters" | |
print unicode(text, errors="replace") | |
# again | |
print "==> Try as with ignoring unknown characters" | |
print unicode(text, errors='ignore') | |
# again | |
print "==> Try as with UTF-8 encoding" | |
print unicode(text, encoding="UTF-8") | |
# but this doesn't work!! | |
try: | |
print "But amazingly you can't encode unicode ext into UTF8" | |
print unicode(utext, encoding="UTF-8") | |
except Exception as e: | |
print "because %s" %e | |
# be smart | |
print "==> So you want unicode in string formats, huh?" | |
try: | |
print u"%s" % text | |
except UnicodeDecodeError as e: | |
print "Not so fast! %s" % e | |
# let's try that again | |
print "==> Now for real using string formats" | |
print u"%s" % unicode(text, 'UTF-8') | |
# so you think that's taking all the fun out of working with string formatting? | |
print "==> Doing it like this works more naturally" | |
print unicode("%s" % text, 'UTF-8') | |
try: | |
print "==> except if the string is already unicode:" | |
print unicode("%s" % utext, 'UTF-8') | |
except Exception as e: | |
print "sorry, no can do, %s" % e | |
# can we join, slice etc. such strings? | |
otext = "ü".join(text) | |
print otext | |
# even better, use export PYTHONIOENCODING=utf8, if you can at start up time | |
# see http://stackoverflow.com/a/27066059/890242 | |
#--------- | |
# let's deal with unicode itself | |
print "===> Unicode text can just be printed like that, because they are of type unicode" | |
print "%s" % utext, type("%s" % utext) | |
# but if you combine the two, the fun starts | |
print "===> Unicode + byte text cannot just be printed like that, because they are of type unicode" | |
try: | |
print "%s == %s" % (utext, text), type("%s" % utext) | |
except UnicodeDecodeError as e: | |
print "because, well, %s" % e | |
# try again | |
print "===> But you can covert them all to unicode or UTF8" | |
print unicode("%s == %s" % (utext, unicode(text, 'UTF-8'))) | |
print "%s == %s" % (utext, unicode(text, 'UTF-8')) | |
try: | |
print "%s == %s" % (utext.decode('UTF-8'), text.decode('UTF-8')) | |
except UnicodeDecodeError as e: | |
print "no so fast... %s" % e | |
except Exception as e: | |
print "no so fast... %s" % e | |
# again | |
print "===> byte strings with codes > 127 are harder. You have to DECODE to utf8" | |
print text.decode('UTF-8') | |
try: | |
print text.encode('UTF-8') | |
except Exception as e: | |
print "because encode won't work, %s" % e | |
print "===> while you have to ENCODE unicodes. You can't decode unicode into UTF-8." | |
print "works:", utext.encode('UTF-8') | |
try: | |
print "doesn't:", utext.decode('UTF-8') | |
except Exception as e: | |
print "%s" % e | |
# in summary, if you know you want | |
print "==> So in summary, always use unicode(text, 'UTF-8') to ensure you have unicode text from byte code" | |
print unicode(text, 'UTF-8') | |
print "==> or save-get a unicode object from your inputs, whether unicode or byte string" | |
_u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t | |
#else unicode(text.encode('UTF-8', 'replace')) | |
print _u(text), type(_u(text)) | |
print _u(utext), type(_u(utext)) | |
print "==> then you can mix match safely" | |
text = _u(text) | |
print "%s == %s" % (text, utext) | |
print u"%s == %s" % (text, utext) | |
# Conclusion: Always use this: | |
# guarantee unicode | |
_u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t | |
_uu = lambda *tt: tuple(_u(t) for t in tt) | |
# guarantee byte string in UTF8 | |
_u8 = lambda t: t.encode('UTF-8', 'replace') if isinstance(t, unicode) else t | |
_uu8 = lambda *tt: tuple(_u8(t) for t in tt) | |
text='Some string with codes > 127, like Zürich' | |
print "==> with _u, _uu" | |
print _u(text), type(_u(text)) | |
print _u(utext), type(_u(utext)) | |
print _uu(text, utext), type(_uu(text, utext)) | |
print "==> with u8, uu8" | |
print _u8(text), type(_u8(text)) | |
print _u8(utext), type(_u8(utext)) | |
print _uu8(text, utext), type(_uu8(text, utext)) | |
# with % formatting, always use _u() and _uu() | |
print "Some unknown input %s" % _u(text) | |
print "Multiple inputs %s, %s" % _uu(text, text) | |
# but with .format be sure to always work with unicode strings | |
print u"Also works with formats: {}".format(_u(text)) | |
print u"Also works with formats: {},{}".format(*_uu(text, text)) | |
# ... or use _u8 and _uu8, because string.format expects byte strings | |
print "Also works with formats: {}".format(_u8(text)) | |
print "Also works with formats: {},{}".format(*_uu8(text, text)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# lambdas to convert into unicode and bytestring UTF8 from any basestring (unicode or str) | |
# | |
# Use as follows: | |
# text = u'Zürich' | |
# utext = 'Zürich' | |
# print "Text %s %s" % _uu(text, utext) | |
# print u"Text %s" % _u(text) | |
# print "Text {},{}".format(*_uu8(text, utext)) | |
# print u"Text {},{}".format(*_uu(text, utext)) | |
# | |
# | |
# guarantee unicode string | |
_u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t | |
_uu = lambda *tt: tuple(_u(t) for t in tt) | |
# guarantee byte string in UTF8 encoding | |
_u8 = lambda t: t.encode('UTF-8', 'replace') if isinstance(t, unicode) else t | |
_uu8 = lambda *tt: tuple(_u8(t) for t in tt) | |
def main(): | |
text='Some string with codes > 127, like Zürich' | |
utext=u'Some string with codes > 127, like Zürich' | |
print "==> with _u, _uu" | |
print _u(text), type(_u(text)) | |
print _u(utext), type(_u(utext)) | |
print _uu(text, utext), type(_uu(text, utext)) | |
print "==> with u8, uu8" | |
print _u8(text), type(_u8(text)) | |
print _u8(utext), type(_u8(utext)) | |
print _uu8(text, utext), type(_uu8(text, utext)) | |
# with % formatting, always use _u() and _uu() | |
print "Some unknown input %s" % _u(text) | |
print "Multiple inputs %s, %s" % _uu(text, text) | |
# but with .format be sure to always work with unicode strings | |
print u"Also works with formats: {}".format(_u(text)) | |
print u"Also works with formats: {},{}".format(*_uu(text, text)) | |
# ... or use _u8 and _uu8, because string.format expects byte strings | |
print "Also works with formats: {}".format(_u8(text)) | |
print "Also works with formats: {},{}".format(*_uu8(text, text)) | |
if __name__ == '__main__': | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# that's the actual mess with unicode() in Python. It only works on either bytestring or unicode input | |
# and you can't actually be sure what you get every single time | |
def doit(*args): | |
print "trying", args | |
text = args[0] | |
ttype = type(text) | |
try: | |
text = unicode(*args) | |
except: | |
print "==> DID NOT WORK: %s (type %s)" % (text, ttype) | |
else: | |
print "==> %s worked (type %s)" % (text, ttype) | |
# we want to have a uniform way to call unicode() for either byte string or unicode
# doesn't work: a non-ASCII byte string without an encoding, and a unicode
# object together with an encoding
doit('zürich')
doit(u'zürich', 'utf-8')
# works: a unicode object on its own, and a byte string with its encoding
doit(u'zürich')
doit('zürich', 'utf-8')
## _u solves it by ensuring we have what we want
# _u always hands unicode() a unicode object; _u8 always hands it a UTF-8
# byte string, which is then decoded with the matching encoding argument
doit(_u('Zürich'))
doit(_u(u'Zürich'))
doit(_u8(u'Zürich'), 'utf-8')
doit(_u8('Zürich'), 'utf-8')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Lambdas zum Konvertieren von beliebigen basestring-Werten (unicode oder str) in Unicode bzw. in UTF-8-Bytestrings
Verwenden Sie wie folgt:
text = u'Zürich '
utext = 'Zürich'
print "Text% s% s"% _uu (Text, Utext)
print u "Text% s"% _u (Text)
print "Text {}, {}". Format (* _ uu8 (Text, Utext))
print u "Text {}, {}". Format (* _ uu (Text, Utext))
Unicode-Zeichenfolge garantieren
_u = Lambda t : t . decodiere ( 'UTF-8' , 'ersetzen' ) wenn isinstance ( t , str ) sonst t
_uu = Lambda * tt : Tupel ( _u ( t ) für t in tt )
Garantie Byte-String in UTF8-Codierung
_u8 = Lambda t : t . kodieren ( 'UTF-8' , 'ersetzen' ) , wenn isinstance ( t , Unicode ) sonst t
_uu8 = Lambda * tt : Tupel ( _u8 ( t ) für t in tt )
def main ():
text = 'Ein String mit Codes> 127, wie Zürich'
utext = u'Einiger String mit Codes> 127, wie Zürich '
print "==> with _u, _uu"
Drucken _u ( Text ), Typ ( _u ( text ))
print _u ( utext ), type ( _u ( utext ))
print _uu ( Text , Utext ), Typ ( _uu ( Text , Utext ))
print "==> mit u8, uu8"
Druck _u8 ( Text ), Typ ( _u8 ( text ))
print _u8 ( utext ), type ( _u8 ( utext ))
print _uu8 ( text , utext ), type ( _uu8 ( text , utext ))
# Verwenden Sie bei% -Formatierung immer _u () und _uu ()
print "Einige unbekannte Eingaben% s" % _u ( Text )
print "Mehrere Eingaben% s,% s" % _uu ( Text , Text )
# aber mit .format muss immer mit Unicode-Strings gearbeitet werden
print u "Funktioniert auch mit folgenden Formaten: {}" . Format ( _u ( Text ))
print u "Funktioniert auch mit folgenden Formaten: {}, {}" . Format ( * _uu ( Text , Text ))
# ... oder verwenden Sie _u8 und _uu8, da string.format Byte-Strings erwartet
print "Funktioniert auch mit Formaten: {}" . Format ( _u8 ( Text ))
print "Funktioniert auch mit folgenden Formaten: {}, {}" . Format ( * _uu8 ( Text , Text ))
if name == 'main' :