Last active
December 14, 2015 22:29
-
-
Save toddysm/5158717 to your computer and use it in GitHub Desktop.
Different ways to deal with Unicode strings in Python 2.x
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reset the interpreter-wide default encoding from ASCII to UTF-8 (Python 2 only).
# Note: it may not always work (depends on installations)
# see: http://blog.ianbicking.org/illusive-setdefaultencoding.html
# This changes implicit str <-> unicode conversions for the whole process;
# explicit .decode()/.encode() calls are the more predictable alternative.
import sys

if sys.version_info[0] == 2:
    # Python 3 has no reload() builtin and already defaults to UTF-8,
    # so the hack is applied on Python 2 only.
    reload(sys)  # else the method will be missing from the sys module
    sys.setdefaultencoding('utf-8')
# convert from <type 'str'> (bytes) to <type 'unicode'>
text.decode('utf-8')
# convert from <type 'unicode'> to <type 'str'> (bytes)
text.encode('ascii')  # no error handler given: behaves like 'strict'
text.encode('ascii', 'ignore')  # silently drops the non-ASCII characters
text.encode('ascii', 'replace')  # replaces each non-ASCII character with ?
text.encode('ascii', 'xmlcharrefreplace')  # turns them into XML character references
text.encode('ascii', 'strict')  # raises UnicodeEncodeError on non-ASCII characters
# for files use codecs module
import codecs
# Reads from f return unicode objects decoded from UTF-8; call f.close() when done.
f = codecs.open('myfile.txt', 'r', encoding='utf-8')
# function that converts the text to Unicode
def to_unicode(text, encoding='utf-8'):
    """Return *text* as a Unicode string, decoding byte strings with *encoding*.

    Args:
        text: Value to convert. Byte strings (``str`` on Python 2, ``bytes``
            on Python 3) are decoded; Unicode strings and non-string values
            are returned unchanged.
        encoding: Codec used to decode byte strings. Defaults to ``'utf-8'``.

    Returns:
        The decoded Unicode string, or *text* itself when it is not a byte
        string.

    Raises:
        UnicodeDecodeError: If a byte string is not valid in *encoding*.
    """
    # ``bytes`` is an alias of ``str`` on Python 2.6+, so this single check
    # selects exactly the "string but not unicode" values on Python 2 and
    # also works on Python 3, where ``basestring``/``unicode`` do not exist.
    if isinstance(text, bytes):
        text = text.decode(encoding)
    return text
# detecting the BOM (run these on the raw byte string, before decoding)
import codecs
text.startswith(codecs.BOM_UTF8)
# The UTF-32 LE BOM begins with the same two bytes as the UTF-16 LE BOM, so
# when these checks are used to choose an encoding, test UTF-32 before UTF-16.
text.startswith(codecs.BOM_UTF32)  # native byte order: same value as the _LE or _BE constant
text.startswith(codecs.BOM_UTF32_LE)
text.startswith(codecs.BOM_UTF32_BE)
text.startswith(codecs.BOM_UTF16)  # native byte order: same value as the _LE or _BE constant
text.startswith(codecs.BOM_UTF16_LE)
text.startswith(codecs.BOM_UTF16_BE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment