fnielsen/encoding.py

## encoding.py
# python2.6

'Finn Årup Nielsen'

u'Finn Årup Nielsen'

unicode('Finn Årup Nielsen', 'utf-8')


# So what is wrong with that?

# Unicode string containing only ASCII characters: len() gives 17.
len(u'Finn Arup Nielsen')

# Python 2 byte string: len() counts bytes, not characters.  Assuming the
# source/terminal is UTF-8, 'Å' occupies two bytes, so this gives 18.
len('Finn Årup Nielsen')

# Unicode string: len() counts characters, so this gives 17 again.
len(u'Finn Årup Nielsen')


# Another example of a problem: Finding words with regular expressions:

import re

# The patterns are raw strings so the backslash reaches the regex engine
# untouched; a plain '\w' literal is an invalid string escape that newer
# Python 3 versions warn about.

# Python 2 byte string: without re.UNICODE, \w only matches ASCII word
# characters, so the bytes that encode 'Å' split the word.
re.findall(r'\w+', 'Finn Årup Nielsen')
# ['Finn', 'rup', 'Nielsen']

# Unicode string with re.UNICODE: \w also matches non-ASCII letters.
re.findall(r'\w+', u'Finn Årup Nielsen', re.UNICODE)
# [u'Finn', u'\xc5rup', u'Nielsen']


# Encoding in Python 3

'Finn Årup Nielsen'

u'Finn Årup Nielsen'


# Encoding from Unicode

u'Rådvad Æblerød'.encode('utf-8')
# 'R\xc3\xa5dvad \xc3\x86bler\xc3\xb8d'

u'Rådvad Æblerød'.encode('ascii')
# UnicodeEncodeError: 'ascii' codec can't encode character u'\xe5' ...

u'Rådvad Æblerød'.encode('ascii', 'ignore')
# 'Rdvad blerd'

u'Rådvad Æblerød'.encode('ascii', 'replace')
# 'R?dvad ?bler?d'

u'Rådvad Æblerød'.encode('ascii', 'xmlcharrefreplace')
# 'R&#229;dvad &#198;bler&#248;d'


# Files
# Write the UTF-8 encoded bytes verbatim.  The b'' prefix makes the literal a
# byte string on Python 2.6+ and Python 3 alike (a plain str cannot be written
# to a binary-mode file on Python 3), and the with-block closes the file even
# if the write fails.
with open('text-with-utf-8.txt', 'wb') as f:
  f.write(b'R\xc3\xa5dvad \xc3\x86belr\xc3\xb8d')

# $ hexdump -C text-with-utf-8.txt
# 00000000  52 c3 a5 64 76 61 64 20  c3 86 62 65 6c 72 c3 b8  |R..dvad ..belr..|
# 00000010  64                                                |d|


# File I/O with Python 2

# Plain open() returns a byte string, so this counts bytes: 17 for the file
# shown in the hexdump above.
print(len(open('text-with-utf-8.txt').read()))

# Decoding the bytes as UTF-8 first counts characters instead: 14.
print(len(unicode(open('text-with-utf-8.txt').read(), 'utf-8')))

# codecs.open() decodes while reading, giving the same character count (14).
import codecs
print(len(codecs.open('text-with-utf-8.txt', encoding='utf-8').read()))


# File I/O with Python 2 default encoding

print(len(open('text-with-utf-8.txt').read()))

print(len(unicode(open('text-with-utf-8.txt').read())))
# UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in ...


# Use ``sys reload''-trick:
# NOTE(review): Python 2 only.  reload(sys) is presumably needed because
# sys.setdefaultencoding is not reachable on a freshly started interpreter --
# confirm against the Python 2 `sys` documentation.  The trick changes the
# implicit str <-> unicode conversion for the whole process, so passing an
# explicit encoding (as in the earlier snippets) is the safer choice.

import sys
reload(sys)
sys.setdefaultencoding('utf-8')
# Still a byte count (17 for the file shown in the hexdump above).
print(len(open('text-with-utf-8.txt').read()))

# unicode() without an explicit encoding now decodes as UTF-8 instead of
# raising UnicodeDecodeError, so this prints the character count (14).
print(len(unicode(open('text-with-utf-8.txt').read())))


# Python 3

# Python 3 reading with UTF-8 environment
# $ LANG=en_US.utf8 ; python3

print(len(open('text-with-utf-8.txt').read()))

print(len(open('text-with-utf-8.txt', encoding='utf-8').read()))


# Python 3 reading with non-UTF-8 environment
# $ LANG=C; python3

print(len(open('text-with-utf-8.txt').read()))
# UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 ...

print(len(open('text-with-utf-8.txt', encoding='utf-8').read()))


# Try the candidate encodings in order and keep the first one that decodes
# without raising.  NOTE: a clean decode does not prove the encoding is
# right -- ISO8859-1 assigns a character to every byte value, so it never
# raises and 'utf-8' is never reached for this file.
for enc in ['ascii', 'ISO8859-1', 'latin1', 'utf-8']:
  try:
    # The with-block closes the handle on success and on a failed decode.
    with open('text-with-utf-8.txt', encoding=enc) as fh:
      s = fh.read()
  except UnicodeDecodeError:
    continue
  print("Read with encoding =", enc)
  break

print(s)
# gives you a UnicodeEncodeError

print(s.encode('ascii', 'replace'))
# b'R??dvad ??belr??d'

# Encoding in source code

# Python 2 script with UTF-8 encoding

#!/usr/bin/python2.6
# -*- coding: utf-8 -*-
print("Rådvad Knivfabrik")

# Python3
Æ = 3
A = 1
A + Æ

"""
http://docs.python.org/howto/unicode.html
Unicode HOWTO in Python documentation

http://diveintopython.org/xml_processing/unicode.html

http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html

Kumar McMillan's talk
http://farmdev.com/talks/unicode/
Unicode In Python, Completely Demystified, from PyCon 2008.
"""
	# python2.6

	'Finn Årup Nielsen'

	u'Finn Årup Nielsen'

	unicode('Finn Årup Nielsen', 'utf-8')


	# So what is wrong with that?

	len(u'Finn Arup Nielsen')

	len('Finn Årup Nielsen')

	len(u'Finn Årup Nielsen')


	# Another example of a problem: Finding words with regular expressions:

	import re
	re.findall('\w+', 'Finn Årup Nielsen')
	# ['Finn', 'rup', 'Nielsen']

	re.findall('\w+', u'Finn Årup Nielsen', re.UNICODE)
	# [u'Finn', u'\xc5rup', u'Nielsen']


	# Encoding in Python 3

	'Finn Årup Nielsen'

	u'Finn Årup Nielsen'


	# Encoding from Unicode

	u'Rådvad Æblerød'.encode('utf-8')
	# 'R\xc3\xa5dvad \xc3\x86bler\xc3\xb8d'

	u'Rådvad Æblerød'.encode('ascii')
	# UnicodeEncodeError: 'ascii' codec can't encode character u'\xe5' ...

	u'Rådvad Æblerød'.encode('ascii', 'ignore')
	# 'Rdvad blerd'

	u'Rådvad Æblerød'.encode('ascii', 'replace')
	# 'R?dvad ?bler?d'

	u'Rådvad Æblerød'.encode('ascii', 'xmlcharrefreplace')
	# 'R&#229;dvad &#198;bler&#248;d'


	# Files
	f = open('text-with-utf-8.txt', 'wb')
	f.write('R\xc3\xa5dvad \xc3\x86belr\xc3\xb8d')
	f.close()

	# $ hexdump -C text-with-utf-8.txt
	# 00000000  52 c3 a5 64 76 61 64 20  c3 86 62 65 6c 72 c3 b8  |R..dvad ..belr..|
	# 00000010  64                                                |d|


	# File I/O with Python 2

	print(len(open('text-with-utf-8.txt').read()))

	print(len(unicode(open('text-with-utf-8.txt').read(), 'utf-8')))

	import codecs
	print(len(codecs.open('text-with-utf-8.txt', encoding='utf-8').read()))



	# File I/O with Python 2 default encoding

	print(len(open('text-with-utf-8.txt').read()))

	print(len(unicode(open('text-with-utf-8.txt').read())))
	# UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in ...


	# Use ``sys reload''-trick:

	import sys
	reload(sys)
	sys.setdefaultencoding('utf-8')
	print(len(open('text-with-utf-8.txt').read()))

	print(len(unicode(open('text-with-utf-8.txt').read())))



	# Python 3

	# Python 3 reading with UTF-8 environment
	# $ LANG=en_US.utf8 ; python3

	print(len(open('text-with-utf-8.txt').read()))

	print(len(open('text-with-utf-8.txt', encoding='utf-8').read()))


	# Python 3 reading with non-UTF-8 environment
	# $ LANG=C; python3

	print(len(open('text-with-utf-8.txt').read()))
	# UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 ...

	print(len(open('text-with-utf-8.txt', encoding='utf-8').read()))


	# Try the candidate encodings in order and keep the first one that decodes
	# without raising.  NOTE: a clean decode does not prove the encoding is
	# right -- ISO8859-1 assigns a character to every byte value, so it never
	# raises and 'utf-8' is never reached for this file.
	for enc in ['ascii', 'ISO8859-1', 'latin1', 'utf-8']:
	  try:
	    # The with-block closes the handle on success and on a failed decode.
	    with open('text-with-utf-8.txt', encoding=enc) as fh:
	      s = fh.read()
	  except UnicodeDecodeError:
	    continue
	  print("Read with encoding =", enc)
	  break

	print(s)
	# gives you a UnicodeEncodeError

	print(s.encode('ascii', 'replace'))
	# b'R??dvad ??belr??d'

	# Encoding in source code

	# Python 2 script with UTF-8 encoding

	#!/usr/bin/python2.6
	# -*- coding: utf-8 -*-
	print("Rådvad Knivfabrik")

	# Python3
	Æ = 3
	A = 1
	A + Æ

	"""
	http://docs.python.org/howto/unicode.html
	Unicode HOWTO in Python documentation

	http://diveintopython.org/xml_processing/unicode.html

	http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html

	Kumar McMillan's talk
	http://farmdev.com/talks/unicode/
	Unicode In Python, Completely Demystified, from PyCon 2008.
	"""