leebird/python2_unicode_string.py

## python2_unicode_string.py
# -*- coding: utf-8 -*-
# Test various characteristics of Unicode string in Python 2.
# In Python 2, we have 2 types to store string data, str and unicode.
# The type str is like a byte string, while the type unicode stores
# unicode codepoints, with each being represented by one or more bytes.

# Define a simple ASCII string, the type is str.
ascii_a = 'abcdefg'
print 'OUTPUT 1'
print type(ascii_a)
print

# Define a unicode string using u literal, the type is unicode.
unicode_a = u'abcdefg'
print 'OUTPUT 2'
print type(unicode_a)
print

# Python does implicit type coversion in comparison.
print 'OUTPUT 3'
print ascii_a == unicode_a
print

# Methods converting between str (bytes) and unicode types.
# Convert unicode to str. We need to specify the encoding we want
# to use to encode the unicode characters down to a list of bytes,
# which is a str in Python. Here we use utf8, and as 'abcdefg'
# are in the set of characters encoded by utf8, this will be fine.
print 'OUTPUT 4'
print unicode_a.encode('utf8')
print type(unicode_a.encode('utf8'))
print

# Convert str to unicode. We need to specify the encoding that
# the current bytes (str) are used. We use the encoding as a
# mapping from the raw bytes to a single unicode character.
# Sometimes, multiple bytes are mapped to one unicode character,
# this is where we need to encoding. As each byte in 'abcdefg' is
# mapped to one unicode character, this is fine.
print 'OUTPUT 5'
print ascii_a.decode('utf8')
print type(ascii_a.decode('utf8'))
print

# Now we move to some more complex cases.
# Here I define a string with 2 Chinese characters. If we store them
# using a str type, then its length will be 6 instead of 2, as each
# Chinese character is represented by 3 bytes. In the disk, these two
# characters are stored exactly using 6 bytes. The editor knows this
# file is encoded in utf-8 (first line), so it decodes the 6 bytes
# into two Chinese characters and displays them correctly. But at this
# point, Python won't decode these 6 bytes into 2 unicode Chinese
# characters. Python just uses the encoding to do lexical analysis.
# See https://docs.python.org/3/reference/lexical_analysis.html for
# more details.
bytes_b = '你好'

# Type will be str, length will still be 6. Python doesn't decode
# them into two unicode chracters.
print 'OUTPUT 6'
print type(bytes_b), len(bytes_b)
print

# Let's print the first 3 bytes for the first character. They will
# be printed as \ooo as ooo is the octal value.
print 'OUTPUT 7'
print bytes_b[0], bytes_b[3], bytes_b[2]
print

# We can explicitly decode the bytes into 2 unicode characters.
unicode_b = bytes_b.decode('utf8')

# Now, the type is unicode, and the length is 2.
print 'OUTPUT 8'
print type(unicode_b), len(unicode_b)
print

# Let's print the 2 unicode characters. (Some details are hidden here,
# i.e., what happens during the printing? Why we can print both str
# and unicode characters correctly? Who is doing the underlying work?)
print 'OUTPUT 9'
print unicode_b[0], unicode_b[1]
print

# Now, we try to decode the bytes using ascii encoding, and it will fail!
# because ascii ranges from 0 to 127, and the bytes are larger than 127,
# which will cause an UnicodeDecodeError.
print 'OUTPUT 10'
try:
    bytes_b.decode('ascii')
except UnicodeDecodeError as e:
    print e
print

# Similarly, if we try to use ascii to encode the 2 unicode characters,
# it will also fail, resulting in a UnicodeEncodeError.
print 'OUTPUT 11'
try:
    unicode_b.encode('ascii')
except UnicodeEncodeError as e:
    print e
print

# We can encode the unicode strings back to bytes using utf8.
print 'OUTPUT 12'
print unicode_b.encode('utf8')
# They should be the same (containing exactly same 6 bytes).
print bytes_b == unicode_b.encode('utf8')
print

# OK, this is the last part: escaping of unicode characters. By escaping
# I mean using '\ooo', '\xhh' or '\uxxxx' to represent characters in the
# string literal. They are NOT single-byte values on disk! '\ooo' and '\xhh'
# literally take 4 bytes each in the source, and '\uxxxx' takes 6 bytes. So
# they are a higher level abstraction than encoding. Encoding defines how
# we convert between a raw byte stream and a string, but escaping defines
# how we use a string (normally ascii characters) to represent a character
# (normally a unicode character). The values (ooo in octal, hh and xxxx in
# hex) are absolute numbers! For example, \xhh represents an 8-bit number,
# and \uxxxx represents a 16-bit number. The escaping/decoding function will
# translate these numbers to the actual character, if possible.
print 'OUTPUT 13'
# The decoding function hidden behind the 'u' prefix knows 1) this should
# be a unicode string, 2) the 16-bit value should then be translated into
# the actual unicode character. How is this different from '你' or u'你'?
# '你' will be just 3 bytes, and u'你' will be translated into a unicode
# character from the 3 bytes, instead of from the escaped \uxxxx value.
escaping_a = u'\u4f60'
print escaping_a, type(escaping_a), len(escaping_a)
print

# Without the 'u' prefix this is just a simple byte string: the escape is
# not interpreted, so the backslash, 'u' and the 4 hex digits are all kept.
print 'OUTPUT 14'
escaping_b = '\u4f60'
print escaping_b, type(escaping_b), len(escaping_b)
print

# We can also convert from the escaped string to unicode characters by
# decoding it with the 'unicode_escape' codec.
print 'OUTPUT 15'
unicode_c = escaping_b.decode('unicode_escape')
print unicode_c, type(unicode_c), len(unicode_c)
print

# How about this? Decode the same bytes as ascii first.
print 'OUTPUT 16'
unicode_d = escaping_b.decode('ascii')
# It's a unicode string now, but it still holds the escape sequence as text.
print unicode_d, type(unicode_d), len(unicode_d)
# Decode the escaped string. (decode() is called on a unicode object here;
# this works because its content is pure ascii.)
unicode_e = unicode_d.decode('unicode_escape')
# Now you see why escaping is a higher abstraction than encoding.
print unicode_e, type(unicode_e), len(unicode_e)
print

# One more thing.
print 'OUTPUT 17'
escaping_c = '\\u4f60'
print escaping_c, type(escaping_c), len(escaping_c)
# Why does '\\u4f60' have the same length as '\u4f60'? They are the same!
print '\\u4f60' == '\u4f60'
# The reason is that '\\' is equal to '\' once Python loads the literal into
# memory, while '\u' is left untouched in a byte string literal. Turning
# '\uxxxx' into a character is a separate step at the string level: it is
# like a second pass over the loaded string, either in bytes or unicode.
print

# How about this? It will fail and result in a UnicodeEncodeError, because
# decode() operates on bytes and the unicode character cannot be turned
# into ascii bytes. 'unicode_escape' assumes its input is ascii characters!
# So in essence, escaping means using ascii characters to represent unicode
# characters. When 'unicode_escape' does the converting, it first sees that
# it was given a unicode string, and then it tries to convert that to bytes
# first, using ascii as the encoding.
print 'OUTPUT 18'
try:
    u'\u4f60'.decode('unicode_escape')
except UnicodeEncodeError as e:
    print e
print
	# -- coding: utf-8 --
	# Test various characteristics of Unicode string in Python 2.
	# In Python 2, we have 2 types to store string data, str and unicode.
	# The type str is like a byte string, while the type unicode stores
	# unicode codepoints, with each being represented by one or more bytes.

	# Define a simple ASCII string, the type is str.
	ascii_a = 'abcdefg'
	print 'OUTPUT 1'
	print type(ascii_a)
	print

	# Define a unicode string using u literal, the type is unicode.
	unicode_a = u'abcdefg'
	print 'OUTPUT 2'
	print type(unicode_a)
	print

	# Python does implicit type coversion in comparison.
	print 'OUTPUT 3'
	print ascii_a == unicode_a
	print

	# Methods converting between str (bytes) and unicode types.
	# Convert unicode to str. We need to specify the encoding we want
	# to use to encode the unicode characters down to a list of bytes,
	# which is a str in Python. Here we use utf8, and as 'abcdefg'
	# are in the set of characters encoded by utf8, this will be fine.
	print 'OUTPUT 4'
	print unicode_a.encode('utf8')
	print type(unicode_a.encode('utf8'))
	print

	# Convert str to unicode. We need to specify the encoding that
	# the current bytes (str) are used. We use the encoding as a
	# mapping from the raw bytes to a single unicode character.
	# Sometimes, multiple bytes are mapped to one unicode character,
	# this is where we need to encoding. As each byte in 'abcdefg' is
	# mapped to one unicode character, this is fine.
	print 'OUTPUT 5'
	print ascii_a.decode('utf8')
	print type(ascii_a.decode('utf8'))
	print

	# Now we move to some more complex cases.
	# Here I define a string with 2 Chinese characters. If we store them
	# using a str type, then its length will be 6 instead of 2, as each
	# Chinese character is represented by 3 bytes. In the disk, these two
	# characters are stored exactly using 6 bytes. The editor knows this
	# file is encoded in utf-8 (first line), so it decodes the 6 bytes
	# into two Chinese characters and displays them correctly. But at this
	# point, Python won't decode these 6 bytes into 2 unicode Chinese
	# characters. Python just uses the encoding to do lexical analysis.
	# See https://docs.python.org/3/reference/lexical_analysis.html for
	# more details.
	bytes_b = '你好'

	# Type will be str, length will still be 6. Python doesn't decode
	# them into two unicode chracters.
	print 'OUTPUT 6'
	print type(bytes_b), len(bytes_b)
	print

	# Let's print the first 3 bytes for the first character. They will
	# be printed as \ooo as ooo is the octal value.
	print 'OUTPUT 7'
	print bytes_b[0], bytes_b[3], bytes_b[2]
	print

	# We can explicitly decode the bytes into 2 unicode characters.
	unicode_b = bytes_b.decode('utf8')

	# Now, the type is unicode, and the length is 2.
	print 'OUTPUT 8'
	print type(unicode_b), len(unicode_b)
	print

	# Let's print the 2 unicode characters. (Some details are hidden here,
	# i.e., what happens during the printing? Why we can print both str
	# and unicode characters correctly? Who is doing the underlying work?)
	print 'OUTPUT 9'
	print unicode_b[0], unicode_b[1]
	print

	# Now, we try to decode the bytes using ascii encoding, and it will fail!
	# because ascii ranges from 0 to 127, and the bytes are larger than 127,
	# which will cause an UnicodeDecodeError.
	print 'OUTPUT 10'
	try:
	bytes_b.decode('ascii')
	except UnicodeDecodeError as e:
	print e
	print

	# Similarly, if we try to use ascii to encode the 2 unicode characters,
	# it will also fail, resulting in a UnicodeEncodeError.
	print 'OUTPUT 11'
	try:
	unicode_b.encode('ascii')
	except UnicodeEncodeError as e:
	print e
	print

	# We can encode the unicode strings back to bytes using utf8.
	print 'OUTPUT 12'
	print unicode_b.encode('utf8')
	# They should be the same (containing exactly same 6 bytes).
	print bytes_b == unicode_b.encode('utf8')
	print

	# OK, this is the last part. Escaping of the unicode charaters. By escaping
	# I mean using '\ooo', '\xhh' or '\uxxxx' to represent characters in the
	# string literal. They are NOT single-byte value on disk! '\ooo', '\xhh'
	# will literally take 4 bytes each, and '\uxxxx' will take 6 bytes. So they
	# are a higer level abstraction than encoding. Encoding is defined as how
	# we convert between raw byte stream and string, but escaping is defined
	# as how we use string (normally ascii characters) to represent a character
	# (normally unicode characters). The values (ooo in octal, hh and xxxx in
	# hex) are the absolute value! For example, \xx represent a 8-bit number,
	# and \uxxxx represent a 16-bit number. The escaping/decoding function will
	# translate these numbers to the actual character, if possible.
	print 'OUTPUT 13'
	# The decoding function hidden behind the literal 'u' knows 1) this should
	# be a unicode string, 2) the 16-bit value then should be translated into
	# the actual unicode character. How is this different from '你' or u'你'?
	# '你' will be just 3 bytes, and u'你' will be translated into unicode
	# character from the 3 bytes, instead of from the escaped \uxxxx value.
	escaping_a = u'\u4f60'
	print escaping_a, type(escaping_a), len(escaping_a)
	print

	# This will be just a simple byte string, without the 'u' literal.
	print 'OUTPUT 14'
	escaping_b = '\u4f60'
	print escaping_b, type(escaping_b), len(escaping_b)
	print

	# We can also convert from escaped string to unicode characters.
	print 'OUTPUT 15'
	unicode_c = escaping_b.decode('unicode_escape')
	print unicode_c, type(unicode_c), len(unicode_c)
	print

	# How about this?
	print 'OUTPUT 16'
	unicode_d = escaping_b.decode('ascii')
	# It's a unicode string now.
	print unicode_d, type(unicode_d), len(unicode_d)
	# Decode the escaped string.
	unicode_e = unicode_d.decode('unicode_escape')
	# Now you see why escaping is a higher abstraction than encoding.
	print unicode_e, type(unicode_e), len(unicode_e)
	print

	# One more thing.
	print 'OUTPUT 17'
	escaping_c = '\\u4f60'
	print escaping_c, type(escaping_c), len(escaping_c)
	# Why '\\u4f60' has the same length with '\u4f60'? They are the same!
	print '\\u4f60' == '\u4f60'
	# The reason is '\\' is eqaul to '\' when Python loads the them into memory.
	# The escaping by '\u' happens after loading, at the string literal level!
	# It happens like a second pass over the loaded string, either in bytes or
	# unicode.
	print

	# How about this? It will fail and result in UnicodeEncodeError. Because
	# the unicode character has 3 bytes, where the decode() function will
	# operate on. And 'unicode_escape' assumes that they are ascii characters!
	# So in essense, escaping means using ascii characters to represent unicode
	# characters. When 'unicode_escape' does the converting, it first knows it's
	# a unicode string, and then it tries to convert them to bytes first, using
	# ascii as encoding.
	print 'OUTPUT 18'
	try:
	u'\u4f60'.decode('unicode_escape')
	except UnicodeEncodeError as e:
	print e
	print