theredpea/utf_8_encoding_for_url_encoding.py

## utf_8_encoding_for_url_encoding.py
#   -  - coding: utf-8 -  -

#  This class http://www.w3.org/International/URLUTF8Encoder.java
#  In Python
#  To understand how
#  新                  ("xin", 1st tone, means "new" in Mandarin)
#  U+65B0             (Unicode codepoint http://unicode-table.com/en/search/?q=%E6%96%B0)
#  %E6%96%B0          (URL encoding in UTF-8 http://www.url-encode-decode.com/)

#  Interesting: Not fixed-width! "新" not same size as "T"
#  Also remember Unicode != Encodings   http://www.joelonsoftware.com/articles/Unicode.html

relevant_functions  ={f.__name__:f.__doc__ for f in (ord, chr, unichr)}

relevant_links      = ('http://stackoverflow.com/a/227472/1175496',)

relevant_knowledge  =('0x7F == 0x007F',
                      '0x7f == 0x7F',
                      '0x7F == 127',
                      """>>>#Right-shifting divides by two then floors
>>> int(bin(10),2)
10
>>> int(bin(10>>1),2)
5
>>> int(bin(10>>2),2)
2
>>> int(bin(10>>3),2)
1
>>> int(bin(10>>4),2)
0""")

'=='.join(relevant_knowledge)

hex_range           = '0123456789abcdef'
hex_gen             = ('%'+f+s for f in hex_range for s in hex_range)
hex_tuple           = tuple(hex_gen)

assert len(hex_tuple)==256

xin                 = u'新'
xin_unicode_point   = '65B0'

#In Python, can lign up equalities and transitive (?) property means all are the same
assert ord(xin)==int(xin_unicode_point, 16) == 26032

import string
digs = string.digits + string.lowercase
def int2base(x, base):
    """http://stackoverflow.com/a/2267446/1175496"""
    if x < 0: sign = -1
    elif x==0: return '0'
    else: sign = 1
    x *= sign
    digits = []
    while x:
        digits.append(digs[x % base])
        x /= base
    if sign < 0:
        digits.append('-')
    digits.reverse()
    return ''.join(digits)


def encode(string, dr_yang=False):
    """
      Encode a string to the "x-www-form-urlencoded" form, enhanced
      with the UTF-8-in-URL proposal. This is what happens:

      <ul>
      <li><p>The ASCII characters 'a' through 'z', 'A' through 'Z',
             and '0' through '9' remain the same.

      <li><p>The unreserved characters - _ . ! ~    ' ( ) remain the same.

      <li><p>The space character ' ' is converted into a plus sign '+'.

      <li><p>All other ASCII characters are converted into the
             3-character string "%xy", where xy is
             the two-digit hexadecimal representation of the character
             code

      <li><p>All non-ASCII characters are encoded in two steps: first
             to a sequence of 2 or 3 bytes, using the UTF-8 algorithm;
             secondly each of these bytes is encoded as "%xx".
      </ul>

      @param s The string to be encoded
      @return The encoded string"""

    sbuf = []
    for char in string:
        #In Java, chr can be stored in int http://docs.oracle.com/javase/specs/jls/se7/html/jls-5.html  jls-5.1.3
        char_int = ord(char)
        if  (   ord('A') <= char_int <= ord('Z')
            or  ord('a') <= char_int <= ord('z')
            or  ord('0') <= char_int <= ord('0')):
            sbuf+= char
        elif char == ' ':
            sbuf+='+'
        elif char in '-_.!~  \()':
            sbuf+= char

            """All non-ASCII characters are encoded in two steps: first
                 to a sequence of 2 or 3 bytes, using the UTF-8 algorithm;
                 secondly each of these bytes is encoded as "%xx"."""

            #http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding-Algorithm.html
            #http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding.html

            utf8_algorithm = """
    1)
    If a code point is the U+0000...U+007F range, it can be viewed as a 7-bit integer, 0bxxxxxxx.
    Map the code point into 1 byte with the first high order bit set to 0 as: B1 = 0b0xxxxxx.

    2)
    If a code point is the U+0080...U+07FF range, it can be viewed as a 11-bit integer, 0byyyyyxxxxxx.
    Map the code point into 2 bytes with first 5 bits stored in the first byte
    and last 6 bits in the second byte: as: B1 = 0b110yyyyy, B2 = 0b10xxxxxx.

    3)
    If a code point is the U+0800...U+FFFF range, it can be viewed as a 16-bit integer, 0bzzzzyyyyyyxxxxxx.
    Map the code point into 3 bytes with first 4 bits stored in the first byte, next 6 bits in the second byte,
    and last 6 bits in the third byte: as: B1 = 0b1110zzzz, B2 = 0b10yyyyyy, B3 = 0b10xxxxxx.

    4)
    If a code point is the U+10000...U+10FFFF range, it can be viewed as a 21-bit integer, 0bvvvzzzzzzyyyyyyxxxxxx.
    Map the code point into 4 bytes with first 3 bits stored in the first byte, next 6 bits in the second byte,
    another 6 bits in the third byte,
    and last 6 bits in the fourth byte: as: B1 = 0b11110xxx, B2 = 0b10zzzzzz, B3 = 0b10yyyyyy, B4 = 0b10xxxxxx.

    Copyright © 2014 by Dr. Herong Yang. All rights reserved."""

            utf8_1 = len(int2base(0x0000,2))    == 1   and len(int2base(0x007F, 2))     == 7
            utf8_2 = len(int2base(0x0080,2))    == 8   and len(int2base(0x07FF, 2))     == 11
            utf8_3 = len(int2base(0x0800,2))    == 12  and len(int2base(0xFFFF, 2))     == 16
            utf8_4 = len(int2base(0x10000,2))   == 17  and len(int2base(0x10FFFF, 2))   == 21

            #The four buckets described above; that their bit representations are possible
            assert utf8_1 and utf8_2 and utf8_3 and utf8_4

            utf8_2_1_filler = int(int2base(6,2))    == 110
            utf8_2_1_filler = int(int2base(0xc0,2)) == 110      * 10**5
            utf8_2_2_filler = int(int2base(0x80,2)) == 10       * 10**6 #Also the fillter for 3_2, 3_3, 3_2, _4_2, 4_3, and 4_4


        elif char_int <= 0x007f:
            #TODO: Understand Dr. Yang's algorithm (not working, uncommented)
            #vs Java algorithm (failing, commented)
            if dr_yang:
                sbuf+= hex_tuple[char_int>>6  & 0x1F | 0xC0]
                sbuf+= hex_tuple[char_int>>0  & 0x3F | 0x80]
            else:
                sbuf+=hex_tuple[0xc0 | (char_int >> 6)]
                sbuf+=hex_tuple[0x80 | (char_int & 0x3F)]
        else:
            if dr_yang:
                sbuf+=hex_tuple[char_int>>12 & 0x0F | 0xE0]
                sbuf+=hex_tuple[char_int>>6  & 0x3F | 0x80]
                sbuf+=hex_tuple[char_int>>0  & 0x3F | 0x80]
            else:
                sbuf+=hex_tuple[0xe0 | (char_int >> 12)]
                sbuf+=hex_tuple[0x80 | ((char_int >> 6) & 0x3F)]
                sbuf+=hex_tuple[0x80 | (char_int & 0x3F)]

    return ''.join(sbuf)

exp_ints = ('%E6','%96','%B0')
exp_enc_xin = ''.join(exp_ints)
enc_xin = encode(xin)

assert enc_xin.lower() == exp_enc_xin.lower(), '{0} is not {1}'.format(enc_xin, exp_enc_xin)
	# - - coding: utf-8 - -

	# This class http://www.w3.org/International/URLUTF8Encoder.java
	# In Python
	# To understand how
	# 新 ("xin", 1st tone, means "new" in Mandarin)
	# U+65B0 (Unicode codepoint http://unicode-table.com/en/search/?q=%E6%96%B0)
	# %E6%96%B0 (URL encoding in UTF-8 http://www.url-encode-decode.com/)

	# Interesting: Not fixed-width! "新" not same size as "T"
	# Also remember Unicode != Encodings http://www.joelonsoftware.com/articles/Unicode.html

	relevant_functions ={f.__name__:f.__doc__ for f in (ord, chr, unichr)}

	relevant_links = ('http://stackoverflow.com/a/227472/1175496',)

	relevant_knowledge =('0x7F == 0x007F',
	'0x7f == 0x7F',
	'0x7F == 127',
	""">>>#Right-shifting divides by two then floors
	>>> int(bin(10),2)
	10
	>>> int(bin(10>>1),2)
	5
	>>> int(bin(10>>2),2)
	2
	>>> int(bin(10>>3),2)
	1
	>>> int(bin(10>>4),2)
	0""")

	'=='.join(relevant_knowledge)

	hex_range = '0123456789abcdef'
	hex_gen = ('%'+f+s for f in hex_range for s in hex_range)
	hex_tuple = tuple(hex_gen)

	assert len(hex_tuple)==256

	xin = u'新'
	xin_unicode_point = '65B0'

	#In Python, can lign up equalities and transitive (?) property means all are the same
	assert ord(xin)==int(xin_unicode_point, 16) == 26032

	import string
	digs = string.digits + string.lowercase
	def int2base(x, base):
	"""http://stackoverflow.com/a/2267446/1175496"""
	if x < 0: sign = -1
	elif x==0: return '0'
	else: sign = 1
	x *= sign
	digits = []
	while x:
	digits.append(digs[x % base])
	x /= base
	if sign < 0:
	digits.append('-')
	digits.reverse()
	return ''.join(digits)


	def encode(string, dr_yang=False):
	"""
	Encode a string to the "x-www-form-urlencoded" form, enhanced
	with the UTF-8-in-URL proposal. This is what happens:

	<ul>
	<li><p>The ASCII characters 'a' through 'z', 'A' through 'Z',
	and '0' through '9' remain the same.

	<li><p>The unreserved characters - _ . ! ~ ' ( ) remain the same.

	<li><p>The space character ' ' is converted into a plus sign '+'.

	<li><p>All other ASCII characters are converted into the
	3-character string "%xy", where xy is
	the two-digit hexadecimal representation of the character
	code

	<li><p>All non-ASCII characters are encoded in two steps: first
	to a sequence of 2 or 3 bytes, using the UTF-8 algorithm;
	secondly each of these bytes is encoded as "%xx".
	</ul>

	@param s The string to be encoded
	@return The encoded string"""

	sbuf = []
	for char in string:
	#In Java, chr can be stored in int http://docs.oracle.com/javase/specs/jls/se7/html/jls-5.html jls-5.1.3
	char_int = ord(char)
	if ( ord('A') <= char_int <= ord('Z')
	or ord('a') <= char_int <= ord('z')
	or ord('0') <= char_int <= ord('0')):
	sbuf+= char
	elif char == ' ':
	sbuf+='+'
	elif char in '-_.!~ \()':
	sbuf+= char

	"""All non-ASCII characters are encoded in two steps: first
	to a sequence of 2 or 3 bytes, using the UTF-8 algorithm;
	secondly each of these bytes is encoded as "%xx"."""

	#http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding-Algorithm.html
	#http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding.html

	utf8_algorithm = """
	1)
	If a code point is the U+0000...U+007F range, it can be viewed as a 7-bit integer, 0bxxxxxxx.
	Map the code point into 1 byte with the first high order bit set to 0 as: B1 = 0b0xxxxxx.

	2)
	If a code point is the U+0080...U+07FF range, it can be viewed as a 11-bit integer, 0byyyyyxxxxxx.
	Map the code point into 2 bytes with first 5 bits stored in the first byte
	and last 6 bits in the second byte: as: B1 = 0b110yyyyy, B2 = 0b10xxxxxx.

	3)
	If a code point is the U+0800...U+FFFF range, it can be viewed as a 16-bit integer, 0bzzzzyyyyyyxxxxxx.
	Map the code point into 3 bytes with first 4 bits stored in the first byte, next 6 bits in the second byte,
	and last 6 bits in the third byte: as: B1 = 0b1110zzzz, B2 = 0b10yyyyyy, B3 = 0b10xxxxxx.

	4)
	If a code point is the U+10000...U+10FFFF range, it can be viewed as a 21-bit integer, 0bvvvzzzzzzyyyyyyxxxxxx.
	Map the code point into 4 bytes with first 3 bits stored in the first byte, next 6 bits in the second byte,
	another 6 bits in the third byte,
	and last 6 bits in the fourth byte: as: B1 = 0b11110xxx, B2 = 0b10zzzzzz, B3 = 0b10yyyyyy, B4 = 0b10xxxxxx.

	Copyright © 2014 by Dr. Herong Yang. All rights reserved."""

	utf8_1 = len(int2base(0x0000,2)) == 1 and len(int2base(0x007F, 2)) == 7
	utf8_2 = len(int2base(0x0080,2)) == 8 and len(int2base(0x07FF, 2)) == 11
	utf8_3 = len(int2base(0x0800,2)) == 12 and len(int2base(0xFFFF, 2)) == 16
	utf8_4 = len(int2base(0x10000,2)) == 17 and len(int2base(0x10FFFF, 2)) == 21

	#The four buckets described above; that their bit representations are possible
	assert utf8_1 and utf8_2 and utf8_3 and utf8_4

	utf8_2_1_filler = int(int2base(6,2)) == 110
	utf8_2_1_filler = int(int2base(0xc0,2)) == 110 * 10**5
	utf8_2_2_filler = int(int2base(0x80,2)) == 10 * 10**6 #Also the fillter for 3_2, 3_3, 3_2, _4_2, 4_3, and 4_4


	elif char_int <= 0x007f:
	#TODO: Understand Dr. Yang's algorithm (not working, uncommented)
	#vs Java algorithm (failing, commented)
	if dr_yang:
	sbuf+= hex_tuple[char_int>>6 & 0x1F \| 0xC0]
	sbuf+= hex_tuple[char_int>>0 & 0x3F \| 0x80]
	else:
	sbuf+=hex_tuple[0xc0 \| (char_int >> 6)]
	sbuf+=hex_tuple[0x80 \| (char_int & 0x3F)]
	else:
	if dr_yang:
	sbuf+=hex_tuple[char_int>>12 & 0x0F \| 0xE0]
	sbuf+=hex_tuple[char_int>>6 & 0x3F \| 0x80]
	sbuf+=hex_tuple[char_int>>0 & 0x3F \| 0x80]
	else:
	sbuf+=hex_tuple[0xe0 \| (char_int >> 12)]
	sbuf+=hex_tuple[0x80 \| ((char_int >> 6) & 0x3F)]
	sbuf+=hex_tuple[0x80 \| (char_int & 0x3F)]

	return ''.join(sbuf)

	exp_ints = ('%E6','%96','%B0')
	exp_enc_xin = ''.join(exp_ints)
	enc_xin = encode(xin)

	assert enc_xin.lower() == exp_enc_xin.lower(), '{0} is not {1}'.format(enc_xin, exp_enc_xin)