toddysm/unicode.py

## unicode.py
# Python 2 only: change the interpreter-wide default encoding from ASCII to
# UTF-8 (UTF-8 is the encoding; "Unicode" itself is not one).
# Note: this is a discouraged hack. It changes every implicit str <-> unicode
# conversion in the process and may not work on every installation.
# see: http://blog.ianbicking.org/illusive-setdefaultencoding.html
import sys
if sys.version_info[0] < 3:
    # site.py removes sys.setdefaultencoding at startup; reloading the
    # module brings the method back.
    reload(sys)  # noqa: F821 - `reload` is a builtin on Python 2
    sys.setdefaultencoding('utf-8')
# Python 3 has no setdefaultencoding and no builtin reload, so nothing is
# done there.

# convert from <type 'str'> to <type 'unicode'> (Python 2): decode a byte
# string holding UTF-8 data. Raises UnicodeDecodeError if the bytes are not
# valid UTF-8. `text` is not defined in this snippet; the result is discarded.
text.decode('utf-8')

# convert from <type 'unicode'> to <type 'str'> (Python 2)
# The optional second argument is the error handler: it decides what happens
# to characters that have no ASCII representation.
text.encode('ascii')  # no handler given: behaves like 'strict' below
text.encode('ascii', 'ignore')    # silently drops the non-ASCII characters
text.encode('ascii', 'replace')   # replaces each non-ASCII character with ?
text.encode('ascii','xmlcharrefreplace') # replaces with XML character references such as &#233;
text.encode('ascii', 'strict')    # raises UnicodeEncodeError (the default handler)

# for files use the codecs module: the file object returned by codecs.open
# decodes the UTF-8 data for you, so reads give unicode text instead of raw
# bytes.
# NOTE: this snippet never closes f; close it (or use a `with` block) in real
# code.
import codecs
f = codecs.open('myfile.txt', 'r', encoding='utf-8')

# function that converts the text to Unicode
def to_unicode(text, encoding='utf-8'):
    """Return *text* as a Unicode string, decoding it if it is a byte string.

    Args:
        text: Value to convert. Byte strings (``str`` on Python 2, ``bytes``
            on Python 3) are decoded. Anything else, including text that is
            already Unicode and non-string objects, is returned unchanged.
        encoding: Codec used to decode byte strings. Defaults to UTF-8.

    Returns:
        The decoded Unicode string, or *text* itself when no decoding was
        needed.

    Raises:
        UnicodeDecodeError: If a byte string is not valid in *encoding*.
    """
    # On Python 2.6+ ``bytes`` is the same type as ``str``, so this one check
    # selects the same values as the former ``basestring``/``unicode`` test
    # while avoiding names that do not exist on Python 3.
    if isinstance(text, bytes):
        text = text.decode(encoding)
    return text

# detecting the BOM (byte order mark) at the start of some data
# The codecs.BOM_* constants are byte strings, so `text` here should be the
# raw, still-undecoded bytes. `text` is not defined in this snippet.
# Each call returns True/False; the results are discarded here because the
# lines only illustrate the available constants.
# BOM_UTF16 and BOM_UTF32 are the native-byte-order variants, i.e. aliases of
# the matching _LE or _BE constant depending on the platform.
# NOTE: BOM_UTF32_LE begins with the same two bytes as BOM_UTF16_LE, so test
# for UTF-32 before UTF-16 when you need to tell them apart.
import codecs
text.startswith(codecs.BOM_UTF8)
text.startswith(codecs.BOM_UTF16)
text.startswith(codecs.BOM_UTF16_LE)
text.startswith(codecs.BOM_UTF16_BE)
text.startswith(codecs.BOM_UTF32)
text.startswith(codecs.BOM_UTF32_LE)
text.startswith(codecs.BOM_UTF32_BE)
	# Python 2 only: change the interpreter-wide default encoding from ASCII to
	# UTF-8 (UTF-8 is the encoding; "Unicode" itself is not one).
	# Note: this is a discouraged hack. It changes every implicit str <-> unicode
	# conversion in the process and may not work on every installation.
	# see: http://blog.ianbicking.org/illusive-setdefaultencoding.html
	import sys
	if sys.version_info[0] < 3:
		# site.py removes sys.setdefaultencoding at startup; reloading the
		# module brings the method back.
		reload(sys)  # noqa: F821 - `reload` is a builtin on Python 2
		sys.setdefaultencoding('utf-8')
	# Python 3 has no setdefaultencoding and no builtin reload, so nothing is
	# done there.

	# convert from <type 'str'> to <type 'unicode'> (Python 2): decode a byte
	# string holding UTF-8 data. Raises UnicodeDecodeError if the bytes are not
	# valid UTF-8. `text` is not defined in this snippet; the result is discarded.
	text.decode('utf-8')

	# convert from <type 'unicode'> to <type 'str'> (Python 2)
	# The optional second argument is the error handler: it decides what happens
	# to characters that have no ASCII representation.
	text.encode('ascii')  # no handler given: behaves like 'strict' below
	text.encode('ascii', 'ignore') # silently drops the non-ASCII characters
	text.encode('ascii', 'replace') # replaces each non-ASCII character with ?
	text.encode('ascii','xmlcharrefreplace') # replaces with XML character references such as &#233;
	text.encode('ascii', 'strict') # raises UnicodeEncodeError (the default handler)

	# for files use the codecs module: the file object returned by codecs.open
	# decodes the UTF-8 data for you, so reads give unicode text instead of raw
	# bytes.
	# NOTE: this snippet never closes f; close it (or use a `with` block) in real
	# code.
	import codecs
	f = codecs.open('myfile.txt', 'r', encoding='utf-8')

	# function that converts the text to Unicode
	def to_unicode(text, encoding='utf-8'):
		"""Return *text* as a Unicode string, decoding it if it is a byte string.

		Args:
			text: Value to convert. Byte strings (``str`` on Python 2,
				``bytes`` on Python 3) are decoded. Anything else, including
				text that is already Unicode and non-string objects, is
				returned unchanged.
			encoding: Codec used to decode byte strings. Defaults to UTF-8.

		Returns:
			The decoded Unicode string, or *text* itself when no decoding
			was needed.

		Raises:
			UnicodeDecodeError: If a byte string is not valid in *encoding*.
		"""
		# On Python 2.6+ ``bytes`` is the same type as ``str``, so this one
		# check selects the same values as a ``basestring``/``unicode`` test
		# while avoiding names that do not exist on Python 3.
		if isinstance(text, bytes):
			text = text.decode(encoding)
		return text

	# detecting the BOM (byte order mark) at the start of some data
	# The codecs.BOM_* constants are byte strings, so `text` here should be the
	# raw, still-undecoded bytes. `text` is not defined in this snippet.
	# Each call returns True/False; the results are discarded here because the
	# lines only illustrate the available constants.
	# BOM_UTF16 and BOM_UTF32 are the native-byte-order variants, i.e. aliases of
	# the matching _LE or _BE constant depending on the platform.
	# NOTE: BOM_UTF32_LE begins with the same two bytes as BOM_UTF16_LE, so test
	# for UTF-32 before UTF-16 when you need to tell them apart.
	import codecs
	text.startswith(codecs.BOM_UTF8)
	text.startswith(codecs.BOM_UTF16)
	text.startswith(codecs.BOM_UTF16_LE)
	text.startswith(codecs.BOM_UTF16_BE)
	text.startswith(codecs.BOM_UTF32)
	text.startswith(codecs.BOM_UTF32_LE)
	text.startswith(codecs.BOM_UTF32_BE)