mtigas/multibyte_count.py

## multibyte_count.py
# Example of making sure that multibyte characters are counted
# properly with regard to APIs such as Twitter.
#
# See http://dev.twitter.com/pages/counting_characters
#
# Uses example from above link:
#     cafe (with diacritical):
#         0x63 0x61 0x66 0xC3 0xA9 (composed character)
#         0x63 0x61 0x66 0x65 0xCC 0x81 (e + combining diacritical)

from unicodedata import normalize

def format_row(value):
    """Return one report line for ``value``: the value, its repr, its length.

    The three fields are tab-separated.  ``value`` may be a byte string or
    a text (unicode) string, so the length is a byte count for the former
    and a count of the string's elements for the latter.
    """
    shown = value
    # On Python 3 "%s" would render a bytes object as "b'...'".  Decode it so
    # the first column shows the text itself.  On Python 2 ``bytes`` is
    # ``str``, the test below is always false, and the raw bytes are passed
    # through untouched, exactly as the original print statements did.
    if isinstance(value, bytes) and not isinstance(value, str):
        shown = value.decode("utf-8")
    return "%s\t%s\t%s" % (shown, repr(value), len(value))


if __name__ == "__main__":
    # print("...") with a single argument behaves the same as the Python 2
    # print statement, so this script runs on Python 2.6+ and on Python 3.
    # print("") stands in for the bare ``print`` that emitted a blank line.
    # Note: the repr column is version dependent (Python 2 shows u'caf\xe9',
    # Python 3 shows the character itself); the len column is not.
    print("")
    print("value\trepr\t\tlen")
    print("")

    # The same word, UTF-8 encoded in two different ways.  The b prefix keeps
    # these as byte strings on Python 3 and changes nothing on Python 2.
    a = b"\x63\x61\x66\x65\xCC\x81"  # "cafe" + U+0301 combining acute accent
    b = b"\x63\x61\x66\xC3\xA9"  # "caf" + U+00E9 precomposed e-acute

    print("----- str -----")

    # Raw bytes: the lengths (6 and 5) are byte counts, not character counts.
    print(format_row(a))
    print(format_row(b))

    print("")
    print("----- unicode -----")

    # bytes.decode is the spelling of unicode(a, 'utf-8') that exists on both
    # Python 2 and Python 3.
    uni_a = a.decode("utf-8")
    uni_b = b.decode("utf-8")

    # Decoded text: the lengths still differ (5 and 4) because one string
    # holds "e" plus a combining mark and the other a single composed letter.
    print(format_row(uni_a))
    print(format_row(uni_b))

    print("")
    print("----- Unicode Normalization Form C -----")

    # NFC composes "e" + combining accent into one code point, so both
    # spellings end up with the same length (4).
    norm_a = normalize("NFC", uni_a)
    norm_b = normalize("NFC", uni_b)

    print(format_row(norm_a))
    print(format_row(norm_b))
    print("")

## output.txt
value	repr		len

----- str -----
café	'cafe\xcc\x81'	6
café	'caf\xc3\xa9'	5

----- unicode -----
café	u'cafe\u0301'	5
café	u'caf\xe9'	4

----- Unicode Normalization Form C -----
café	u'caf\xe9'	4
café	u'caf\xe9'	4
	# Example of making sure that multibyte characters are counted
	# properly with regard to APIs such as Twitter.
	#
	# See http://dev.twitter.com/pages/counting_characters
	#
	# Uses example from above link:
	# cafe (with diacritical):
	# 0x63 0x61 0x66 0xC3 0xA9 (composed character)
	# 0x63 0x61 0x66 0x65 0xCC 0x81 (e + combining diacritical)

	from unicodedata import normalize

	def format_row(value):
	    """Return one report line for ``value``: the value, its repr, its length.

	    The three fields are tab-separated.  ``value`` may be a byte string or
	    a text (unicode) string, so the length is a byte count for the former
	    and a count of the string's elements for the latter.
	    """
	    shown = value
	    # On Python 3 "%s" would render a bytes object as "b'...'".  Decode it
	    # so the first column shows the text itself.  On Python 2 ``bytes`` is
	    # ``str``, the test below is always false, and the raw bytes are passed
	    # through untouched.
	    if isinstance(value, bytes) and not isinstance(value, str):
	        shown = value.decode("utf-8")
	    return "%s\t%s\t%s" % (shown, repr(value), len(value))


	if __name__ == "__main__":
	    # The statements below form the body of the ``if``; they are indented
	    # accordingly (the flattened layout was an IndentationError).
	    # print("...") with a single argument behaves the same as the Python 2
	    # print statement, so this runs on Python 2.6+ and on Python 3.
	    # print("") stands in for the bare ``print`` that emitted a blank line.
	    print("")
	    print("value\trepr\t\tlen")
	    print("")

	    # The same word, UTF-8 encoded in two different ways.  The b prefix
	    # keeps these as byte strings on Python 3 and changes nothing on
	    # Python 2.
	    a = b"\x63\x61\x66\x65\xCC\x81"  # "cafe" + U+0301 combining acute accent
	    b = b"\x63\x61\x66\xC3\xA9"  # "caf" + U+00E9 precomposed e-acute

	    print("----- str -----")

	    # Raw bytes: the lengths (6 and 5) are byte counts.
	    print(format_row(a))
	    print(format_row(b))

	    print("")
	    print("----- unicode -----")

	    # bytes.decode is the spelling of unicode(a, 'utf-8') that exists on
	    # both Python 2 and Python 3.
	    uni_a = a.decode("utf-8")
	    uni_b = b.decode("utf-8")

	    # Decoded text: the lengths still differ (5 and 4) because one string
	    # holds "e" plus a combining mark and the other a composed letter.
	    print(format_row(uni_a))
	    print(format_row(uni_b))

	    print("")
	    print("----- Unicode Normalization Form C -----")

	    # NFC composes "e" + combining accent into one code point, so both
	    # spellings end up with the same length (4).
	    norm_a = normalize("NFC", uni_a)
	    norm_b = normalize("NFC", uni_b)

	    print(format_row(norm_a))
	    print(format_row(norm_b))
	    print("")
	value repr len

	----- str -----
	café 'cafe\xcc\x81' 6
	café 'caf\xc3\xa9' 5

	----- unicode -----
	café u'cafe\u0301' 5
	café u'caf\xe9' 4

	----- Unicode Normalization Form C -----
	café u'caf\xe9' 4
	café u'caf\xe9' 4