juandebravo/encodings.py

## encodings.py
# -*- coding: utf-8 -*-
import sys

# Build the list of text strings to inspect: every command-line argument,
# or two built-in samples when the script is run without arguments.
if len(sys.argv) > 1:
    # Python 2 supplies arguments as byte strings, which are decoded here
    # (assumed to be UTF-8, as before). Python 3 already supplies text, and
    # the Python-2-only ``unicode`` builtin does not exist there, so only
    # byte strings are decoded.
    code_points = [arg.decode('utf-8') if isinstance(arg, bytes) else arg
                   for arg in sys.argv[1:]]
else:
    # Testing values
    code_points = [u'\U0001F37A\U00000045\U0000039B', u'\U0001F37A']

def handle_encoding(encoding, code_point):
    """Print the bytes produced by encoding ``code_point`` with ``encoding``.

    Writes one line to standard output: the encoding name right-aligned in a
    15-character column, then ``' ---> '``, then the encoded bytes as
    colon-separated lowercase hex values (not zero-padded) followed by the
    byte count in parentheses. When the text cannot be encoded, or the codec
    name is unknown, the line carries an "Unable to encode" message instead.

    Args:
        encoding: Name of the codec to use, e.g. ``'utf-8'``.
        code_point: Text string to encode.
    """
    label = '{:>15}'.format(encoding)
    try:
        # Encode once and reuse the result for both the hex dump and the
        # length. Wrapping in bytearray makes iteration yield integers on
        # both Python 2 and Python 3, so no ord() call is needed.
        encoded = bytearray(code_point.encode(encoding))
    except (UnicodeError, LookupError):
        # UnicodeError: the text is not representable in this encoding.
        # LookupError: the codec name is not known.
        detail = 'Unable to encode the codepoint in {0}'.format(encoding)
    else:
        hex_bytes = ':'.join('{0:x}'.format(byte) for byte in encoded)
        detail = '{0} ({1})'.format(hex_bytes, len(encoded))
    # A single parenthesised argument prints identically with the Python 2
    # print statement and the Python 3 print function.
    print(''.join([label, ' ---> ', detail]))

# Report each input string: the text itself, its repr(), and then its byte
# sequence under each of a fixed set of encodings.
for code_point in code_points:
    # print with a single parenthesised argument produces the same output as
    # the Python 2 print statement and is also valid syntax on Python 3.
    print('{:>15}'.format('character') + ' ---> ' + code_point)
    # On Python 2 the repr() is the u'...' escaped form shown in the sample
    # output in this file.
    print('{:>15}'.format('code points') + ' ---> ' + repr(code_point))
    for coding in ('ascii', 'latin-1', 'utf-8', 'utf-16', 'utf-16be', 'utf-16le'):
        handle_encoding(coding, code_point)


# python encoding_info.py
#      character ---> 🍺EΛ
#    code points ---> u'\U0001f37aE\u039b'
#          ascii ---> Unable to encode the codepoint in ascii
#        latin-1 ---> Unable to encode the codepoint in latin-1
#          utf-8 ---> f0:9f:8d:ba:45:ce:9b (7)
#         utf-16 ---> ff:fe:3c:d8:7a:df:45:0:9b:3 (10)
#       utf-16be ---> d8:3c:df:7a:0:45:3:9b (8)
#       utf-16le ---> 3c:d8:7a:df:45:0:9b:3 (8)
#      character ---> 🍺
#    code points ---> u'\U0001f37a'
#          ascii ---> Unable to encode the codepoint in ascii
#        latin-1 ---> Unable to encode the codepoint in latin-1
#          utf-8 ---> f0:9f:8d:ba (4)
#         utf-16 ---> ff:fe:3c:d8:7a:df (6)
#       utf-16be ---> d8:3c:df:7a (4)
#       utf-16le ---> 3c:d8:7a:df (4)


# python encoding_info.py "OLA KE ASE KOMO VA KE ASE"
#      character ---> OLA KE ASE KOMO VA KE ASE
#    code points ---> u'OLA KE ASE KOMO VA KE ASE'
#          ascii ---> 4f:4c:41:20:4b:45:20:41:53:45:20:4b:4f:4d:4f:20:56:41:20:4b:45:20:41:53:45 (25)
#        latin-1 ---> 4f:4c:41:20:4b:45:20:41:53:45:20:4b:4f:4d:4f:20:56:41:20:4b:45:20:41:53:45 (25)
#          utf-8 ---> 4f:4c:41:20:4b:45:20:41:53:45:20:4b:4f:4d:4f:20:56:41:20:4b:45:20:41:53:45 (25)
#         utf-16 ---> ff:fe:4f:0:4c:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45:0:20:0:4b:0:4f:0:4d:0:4f:0:20:0:56:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45:0 (52)
#       utf-16be ---> 0:4f:0:4c:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45:0:20:0:4b:0:4f:0:4d:0:4f:0:20:0:56:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45 (50)
#       utf-16le ---> 4f:0:4c:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45:0:20:0:4b:0:4f:0:4d:0:4f:0:20:0:56:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45:0 (50)
	# -*- coding: utf-8 -*-
	import sys

	# Build the list of text strings to inspect: every command-line argument,
	# or two built-in samples when the script is run without arguments.
	if len(sys.argv) > 1:
	    # Python 2 supplies arguments as byte strings, which are decoded here
	    # (assumed to be UTF-8, as before). Python 3 already supplies text, and
	    # the Python-2-only ``unicode`` builtin does not exist there, so only
	    # byte strings are decoded.
	    code_points = [arg.decode('utf-8') if isinstance(arg, bytes) else arg
	                   for arg in sys.argv[1:]]
	else:
	    # Testing values
	    code_points = [u'\U0001F37A\U00000045\U0000039B', u'\U0001F37A']

	def handle_encoding(encoding, code_point):
	    """Print the bytes produced by encoding ``code_point`` with ``encoding``.

	    Writes one line to standard output: the encoding name right-aligned in
	    a 15-character column, then ``' ---> '``, then the encoded bytes as
	    colon-separated lowercase hex values (not zero-padded) followed by the
	    byte count in parentheses. When the text cannot be encoded, or the
	    codec name is unknown, the line carries an "Unable to encode" message
	    instead.

	    Args:
	        encoding: Name of the codec to use, e.g. ``'utf-8'``.
	        code_point: Text string to encode.
	    """
	    label = '{:>15}'.format(encoding)
	    try:
	        # Encode once and reuse the result for both the hex dump and the
	        # length. Wrapping in bytearray makes iteration yield integers on
	        # both Python 2 and Python 3, so no ord() call is needed.
	        encoded = bytearray(code_point.encode(encoding))
	    except (UnicodeError, LookupError):
	        # UnicodeError: the text is not representable in this encoding.
	        # LookupError: the codec name is not known.
	        detail = 'Unable to encode the codepoint in {0}'.format(encoding)
	    else:
	        hex_bytes = ':'.join('{0:x}'.format(byte) for byte in encoded)
	        detail = '{0} ({1})'.format(hex_bytes, len(encoded))
	    # A single parenthesised argument prints identically with the Python 2
	    # print statement and the Python 3 print function.
	    print(''.join([label, ' ---> ', detail]))

	# Report each input string: the text itself, its repr(), and then its byte
	# sequence under each of a fixed set of encodings.
	for code_point in code_points:
	    # print with a single parenthesised argument produces the same output
	    # as the Python 2 print statement and is also valid syntax on Python 3.
	    print('{:>15}'.format('character') + ' ---> ' + code_point)
	    # On Python 2 the repr() is the u'...' escaped form shown in the sample
	    # output in this file.
	    print('{:>15}'.format('code points') + ' ---> ' + repr(code_point))
	    for coding in ('ascii', 'latin-1', 'utf-8', 'utf-16', 'utf-16be', 'utf-16le'):
	        handle_encoding(coding, code_point)


	# python encoding_info.py
	# character ---> 🍺EΛ
	# code points ---> u'\U0001f37aE\u039b'
	# ascii ---> Unable to encode the codepoint in ascii
	# latin-1 ---> Unable to encode the codepoint in latin-1
	# utf-8 ---> f0:9f:8d:ba:45:ce:9b (7)
	# utf-16 ---> ff:fe:3c:d8:7a:df:45:0:9b:3 (10)
	# utf-16be ---> d8:3c:df:7a:0:45:3:9b (8)
	# utf-16le ---> 3c:d8:7a:df:45:0:9b:3 (8)
	# character ---> 🍺
	# code points ---> u'\U0001f37a'
	# ascii ---> Unable to encode the codepoint in ascii
	# latin-1 ---> Unable to encode the codepoint in latin-1
	# utf-8 ---> f0:9f:8d:ba (4)
	# utf-16 ---> ff:fe:3c:d8:7a:df (6)
	# utf-16be ---> d8:3c:df:7a (4)
	# utf-16le ---> 3c:d8:7a:df (4)


	# python encoding_info.py "OLA KE ASE KOMO VA KE ASE"
	# character ---> OLA KE ASE KOMO VA KE ASE
	# code points ---> u'OLA KE ASE KOMO VA KE ASE'
	# ascii ---> 4f:4c:41:20:4b:45:20:41:53:45:20:4b:4f:4d:4f:20:56:41:20:4b:45:20:41:53:45 (25)
	# latin-1 ---> 4f:4c:41:20:4b:45:20:41:53:45:20:4b:4f:4d:4f:20:56:41:20:4b:45:20:41:53:45 (25)
	# utf-8 ---> 4f:4c:41:20:4b:45:20:41:53:45:20:4b:4f:4d:4f:20:56:41:20:4b:45:20:41:53:45 (25)
	# utf-16 ---> ff:fe:4f:0:4c:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45:0:20:0:4b:0:4f:0:4d:0:4f:0:20:0:56:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45:0 (52)
	# utf-16be ---> 0:4f:0:4c:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45:0:20:0:4b:0:4f:0:4d:0:4f:0:20:0:56:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45 (50)
	# utf-16le ---> 4f:0:4c:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45:0:20:0:4b:0:4f:0:4d:0:4f:0:20:0:56:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45:0 (50)