Created
October 8, 2016 00:49
-
-
Save leebird/84afde1649be8c47a723f54103159ad0 to your computer and use it in GitHub Desktop.
Test Python 2 str and unicode types
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Test various characteristics of Unicode string in Python 2. | |
# In Python 2, we have 2 types to store string data, str and unicode. | |
# The type str is like a byte string, while the type unicode stores | |
# unicode codepoints, with each being represented by one or more bytes. | |
# Define a simple ASCII string, the type is str. | |
ascii_a = 'abcdefg' | |
print 'OUTPUT 1' | |
print type(ascii_a) | |
# Define a unicode string using u literal, the type is unicode. | |
unicode_a = u'abcdefg' | |
print 'OUTPUT 2' | |
print type(unicode_a) | |
# Python does implicit type coversion in comparison. | |
print 'OUTPUT 3' | |
print ascii_a == unicode_a | |
# Methods converting between str (bytes) and unicode types. | |
# Convert unicode to str. We need to specify the encoding we want | |
# to use to encode the unicode characters down to a list of bytes, | |
# which is a str in Python. Here we use utf8, and as 'abcdefg' | |
# are in the set of characters encoded by utf8, this will be fine. | |
print 'OUTPUT 4' | |
print unicode_a.encode('utf8') | |
print type(unicode_a.encode('utf8')) | |
# Convert str to unicode. We need to specify the encoding that | |
# the current bytes (str) are used. We use the encoding as a | |
# mapping from the raw bytes to a single unicode character. | |
# Sometimes, multiple bytes are mapped to one unicode character, | |
# this is where we need to encoding. As each byte in 'abcdefg' is | |
# mapped to one unicode character, this is fine. | |
print 'OUTPUT 5' | |
print ascii_a.decode('utf8') | |
print type(ascii_a.decode('utf8')) | |
# Now we move to some more complex cases. | |
# Here I define a string with 2 Chinese characters. If we store them | |
# using a str type, then its length will be 6 instead of 2, as each | |
# Chinese character is represented by 3 bytes. In the disk, these two | |
# characters are stored exactly using 6 bytes. The editor knows this | |
# file is encoded in utf-8 (first line), so it decodes the 6 bytes | |
# into two Chinese characters and displays them correctly. But at this | |
# point, Python won't decode these 6 bytes into 2 unicode Chinese | |
# characters. Python just uses the encoding to do lexical analysis. | |
# See https://docs.python.org/3/reference/lexical_analysis.html for | |
# more details. | |
bytes_b = '你好' | |
# Type will be str, length will still be 6. Python doesn't decode | |
# them into two unicode chracters. | |
print 'OUTPUT 6' | |
print type(bytes_b), len(bytes_b) | |
# Let's print the first 3 bytes for the first character. They will | |
# be printed as \ooo as ooo is the octal value. | |
print 'OUTPUT 7' | |
print bytes_b[0], bytes_b[3], bytes_b[2] | |
# We can explicitly decode the bytes into 2 unicode characters. | |
unicode_b = bytes_b.decode('utf8') | |
# Now, the type is unicode, and the length is 2. | |
print 'OUTPUT 8' | |
print type(unicode_b), len(unicode_b) | |
# Let's print the 2 unicode characters. (Some details are hidden here, | |
# i.e., what happens during the printing? Why we can print both str | |
# and unicode characters correctly? Who is doing the underlying work?) | |
print 'OUTPUT 9' | |
print unicode_b[0], unicode_b[1] | |
# Now, we try to decode the bytes using ascii encoding, and it will fail! | |
# because ascii ranges from 0 to 127, and the bytes are larger than 127, | |
# which will cause an UnicodeDecodeError. | |
print 'OUTPUT 10' | |
try: | |
bytes_b.decode('ascii') | |
except UnicodeDecodeError as e: | |
print e | |
# Similarly, if we try to use ascii to encode the 2 unicode characters, | |
# it will also fail, resulting in a UnicodeEncodeError. | |
print 'OUTPUT 11' | |
try: | |
unicode_b.encode('ascii') | |
except UnicodeEncodeError as e: | |
print e | |
# We can encode the unicode strings back to bytes using utf8. | |
print 'OUTPUT 12' | |
print unicode_b.encode('utf8') | |
# They should be the same (containing exactly same 6 bytes). | |
print bytes_b == unicode_b.encode('utf8') | |
# OK, this is the last part. Escaping of the unicode charaters. By escaping | |
# I mean using '\ooo', '\xhh' or '\uxxxx' to represent characters in the | |
# string literal. They are NOT single-byte value on disk! '\ooo', '\xhh' | |
# will literally take 4 bytes each, and '\uxxxx' will take 6 bytes. So they | |
# are a higer level abstraction than encoding. Encoding is defined as how | |
# we convert between raw byte stream and string, but escaping is defined | |
# as how we use string (normally ascii characters) to represent a character | |
# (normally unicode characters). The values (ooo in octal, hh and xxxx in | |
# hex) are the absolute value! For example, \xx represent a 8-bit number, | |
# and \uxxxx represent a 16-bit number. The escaping/decoding function will | |
# translate these numbers to the actual character, if possible. | |
print 'OUTPUT 13' | |
# The decoding function hidden behind the literal 'u' knows 1) this should | |
# be a unicode string, 2) the 16-bit value then should be translated into | |
# the actual unicode character. How is this different from '你' or u'你'? | |
# '你' will be just 3 bytes, and u'你' will be translated into unicode | |
# character from the 3 bytes, instead of from the escaped \uxxxx value. | |
escaping_a = u'\u4f60' | |
print escaping_a, type(escaping_a), len(escaping_a) | |
# This will be just a simple byte string, without the 'u' literal. | |
print 'OUTPUT 14' | |
escaping_b = '\u4f60' | |
print escaping_b, type(escaping_b), len(escaping_b) | |
# We can also convert from escaped string to unicode characters. | |
print 'OUTPUT 15' | |
unicode_c = escaping_b.decode('unicode_escape') | |
print unicode_c, type(unicode_c), len(unicode_c) | |
# How about this? | |
print 'OUTPUT 16' | |
unicode_d = escaping_b.decode('ascii') | |
# It's a unicode string now. | |
print unicode_d, type(unicode_d), len(unicode_d) | |
# Decode the escaped string. | |
unicode_e = unicode_d.decode('unicode_escape') | |
# Now you see why escaping is a higher abstraction than encoding. | |
print unicode_e, type(unicode_e), len(unicode_e) | |
# One more thing. | |
print 'OUTPUT 17' | |
escaping_c = '\\u4f60' | |
print escaping_c, type(escaping_c), len(escaping_c) | |
# Why '\\u4f60' has the same length with '\u4f60'? They are the same! | |
print '\\u4f60' == '\u4f60' | |
# The reason is '\\' is eqaul to '\' when Python loads the them into memory. | |
# The escaping by '\u' happens after loading, at the string literal level! | |
# It happens like a second pass over the loaded string, either in bytes or | |
# unicode. | |
# How about this? It will fail and result in UnicodeEncodeError. Because | |
# the unicode character has 3 bytes, where the decode() function will | |
# operate on. And 'unicode_escape' assumes that they are ascii characters! | |
# So in essense, escaping means using ascii characters to represent unicode | |
# characters. When 'unicode_escape' does the converting, it first knows it's | |
# a unicode string, and then it tries to convert them to bytes first, using | |
# ascii as encoding. | |
print 'OUTPUT 18' | |
try: | |
u'\u4f60'.decode('unicode_escape') | |
except UnicodeEncodeError as e: | |
print e | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment