Skip to content

Instantly share code, notes, and snippets.

@theredpea
Last active July 14, 2022 19:44
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save theredpea/9772384 to your computer and use it in GitHub Desktop.
Save theredpea/9772384 to your computer and use it in GitHub Desktop.
Understanding UTF-8 Encoding algorithm
# - - coding: utf-8 - -
# This class http://www.w3.org/International/URLUTF8Encoder.java
# In Python
# To understand how
# 新 ("xin", 1st tone, means "new" in Mandarin)
# U+65B0 (Unicode codepoint http://unicode-table.com/en/search/?q=%E6%96%B0)
# %E6%96%B0 (URL encoding in UTF-8 http://www.url-encode-decode.com/)
# Interesting: Not fixed-width! "新" not same size as "T"
# Also remember Unicode != Encodings http://www.joelonsoftware.com/articles/Unicode.html
relevant_functions ={f.__name__:f.__doc__ for f in (ord, chr, unichr)}
relevant_links = ('http://stackoverflow.com/a/227472/1175496',)
relevant_knowledge =('0x7F == 0x007F',
'0x7f == 0x7F',
'0x7F == 127',
""">>>#Right-shifting divides by two then floors
>>> int(bin(10),2)
10
>>> int(bin(10>>1),2)
5
>>> int(bin(10>>2),2)
2
>>> int(bin(10>>3),2)
1
>>> int(bin(10>>4),2)
0""")
'=='.join(relevant_knowledge)
hex_range = '0123456789abcdef'
hex_gen = ('%'+f+s for f in hex_range for s in hex_range)
hex_tuple = tuple(hex_gen)
assert len(hex_tuple)==256
xin = u'新'
xin_unicode_point = '65B0'
#In Python, can lign up equalities and transitive (?) property means all are the same
assert ord(xin)==int(xin_unicode_point, 16) == 26032
import string
digs = string.digits + string.lowercase
def int2base(x, base):
"""http://stackoverflow.com/a/2267446/1175496"""
if x < 0: sign = -1
elif x==0: return '0'
else: sign = 1
x *= sign
digits = []
while x:
digits.append(digs[x % base])
x /= base
if sign < 0:
digits.append('-')
digits.reverse()
return ''.join(digits)
def encode(string, dr_yang=False):
"""
Encode a string to the "x-www-form-urlencoded" form, enhanced
with the UTF-8-in-URL proposal. This is what happens:
<ul>
<li><p>The ASCII characters 'a' through 'z', 'A' through 'Z',
and '0' through '9' remain the same.
<li><p>The unreserved characters - _ . ! ~ ' ( ) remain the same.
<li><p>The space character ' ' is converted into a plus sign '+'.
<li><p>All other ASCII characters are converted into the
3-character string "%xy", where xy is
the two-digit hexadecimal representation of the character
code
<li><p>All non-ASCII characters are encoded in two steps: first
to a sequence of 2 or 3 bytes, using the UTF-8 algorithm;
secondly each of these bytes is encoded as "%xx".
</ul>
@param s The string to be encoded
@return The encoded string"""
sbuf = []
for char in string:
#In Java, chr can be stored in int http://docs.oracle.com/javase/specs/jls/se7/html/jls-5.html jls-5.1.3
char_int = ord(char)
if ( ord('A') <= char_int <= ord('Z')
or ord('a') <= char_int <= ord('z')
or ord('0') <= char_int <= ord('0')):
sbuf+= char
elif char == ' ':
sbuf+='+'
elif char in '-_.!~ \()':
sbuf+= char
"""All non-ASCII characters are encoded in two steps: first
to a sequence of 2 or 3 bytes, using the UTF-8 algorithm;
secondly each of these bytes is encoded as "%xx"."""
#http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding-Algorithm.html
#http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding.html
utf8_algorithm = """
1)
If a code point is the U+0000...U+007F range, it can be viewed as a 7-bit integer, 0bxxxxxxx.
Map the code point into 1 byte with the first high order bit set to 0 as: B1 = 0b0xxxxxx.
2)
If a code point is the U+0080...U+07FF range, it can be viewed as a 11-bit integer, 0byyyyyxxxxxx.
Map the code point into 2 bytes with first 5 bits stored in the first byte
and last 6 bits in the second byte: as: B1 = 0b110yyyyy, B2 = 0b10xxxxxx.
3)
If a code point is the U+0800...U+FFFF range, it can be viewed as a 16-bit integer, 0bzzzzyyyyyyxxxxxx.
Map the code point into 3 bytes with first 4 bits stored in the first byte, next 6 bits in the second byte,
and last 6 bits in the third byte: as: B1 = 0b1110zzzz, B2 = 0b10yyyyyy, B3 = 0b10xxxxxx.
4)
If a code point is the U+10000...U+10FFFF range, it can be viewed as a 21-bit integer, 0bvvvzzzzzzyyyyyyxxxxxx.
Map the code point into 4 bytes with first 3 bits stored in the first byte, next 6 bits in the second byte,
another 6 bits in the third byte,
and last 6 bits in the fourth byte: as: B1 = 0b11110xxx, B2 = 0b10zzzzzz, B3 = 0b10yyyyyy, B4 = 0b10xxxxxx.
Copyright © 2014 by Dr. Herong Yang. All rights reserved."""
utf8_1 = len(int2base(0x0000,2)) == 1 and len(int2base(0x007F, 2)) == 7
utf8_2 = len(int2base(0x0080,2)) == 8 and len(int2base(0x07FF, 2)) == 11
utf8_3 = len(int2base(0x0800,2)) == 12 and len(int2base(0xFFFF, 2)) == 16
utf8_4 = len(int2base(0x10000,2)) == 17 and len(int2base(0x10FFFF, 2)) == 21
#The four buckets described above; that their bit representations are possible
assert utf8_1 and utf8_2 and utf8_3 and utf8_4
utf8_2_1_filler = int(int2base(6,2)) == 110
utf8_2_1_filler = int(int2base(0xc0,2)) == 110 * 10**5
utf8_2_2_filler = int(int2base(0x80,2)) == 10 * 10**6 #Also the fillter for 3_2, 3_3, 3_2, _4_2, 4_3, and 4_4
elif char_int <= 0x007f:
#TODO: Understand Dr. Yang's algorithm (not working, uncommented)
#vs Java algorithm (failing, commented)
if dr_yang:
sbuf+= hex_tuple[char_int>>6 & 0x1F | 0xC0]
sbuf+= hex_tuple[char_int>>0 & 0x3F | 0x80]
else:
sbuf+=hex_tuple[0xc0 | (char_int >> 6)]
sbuf+=hex_tuple[0x80 | (char_int & 0x3F)]
else:
if dr_yang:
sbuf+=hex_tuple[char_int>>12 & 0x0F | 0xE0]
sbuf+=hex_tuple[char_int>>6 & 0x3F | 0x80]
sbuf+=hex_tuple[char_int>>0 & 0x3F | 0x80]
else:
sbuf+=hex_tuple[0xe0 | (char_int >> 12)]
sbuf+=hex_tuple[0x80 | ((char_int >> 6) & 0x3F)]
sbuf+=hex_tuple[0x80 | (char_int & 0x3F)]
return ''.join(sbuf)
exp_ints = ('%E6','%96','%B0')
exp_enc_xin = ''.join(exp_ints)
enc_xin = encode(xin)
assert enc_xin.lower() == exp_enc_xin.lower(), '{0} is not {1}'.format(enc_xin, exp_enc_xin)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment