Skip to content

Instantly share code, notes, and snippets.

@juandebravo
Created August 3, 2013 23:23
Show Gist options
  • Save juandebravo/6148367 to your computer and use it in GitHub Desktop.
Save juandebravo/6148367 to your computer and use it in GitHub Desktop.
Check the representation of a set of characters using different encodings
# -*- coding: utf-8 -*-
import sys
# Build the list of sample strings to analyse: one per command-line
# argument, or a built-in pair of test values when none are given.
if len(sys.argv) > 1:
    # On Python 2 argv holds byte strings and must be decoded (UTF-8 is
    # assumed, as before); on Python 3 argv is already text and is used
    # as-is. `bytes` is an alias of `str` on Python 2.
    code_points = [
        arg.decode('utf-8') if isinstance(arg, bytes) else arg
        for arg in sys.argv[1:]
    ]
else:
    # Testing values
    code_points = [u'\U0001F37A\U00000045\U0000039B', u'\U0001F37A']
    # Alternative test value: code_points = [u'❯❯']
def handle_encoding(encoding, code_point):
    """Print how ``code_point`` is represented in ``encoding``.

    On success one line is printed: the encoding name right-aligned to 15
    columns, `` ---> ``, the encoded bytes as colon-separated lowercase hex
    (no zero padding, so a zero byte is shown as ``0``), and the byte count
    in parentheses.  If the text cannot be encoded, or the codec name is
    unknown, a line saying so is printed instead of raising.

    Args:
        encoding: Codec name passed to ``code_point.encode``.
        code_point: Text string to encode.
    """
    label = '{:>15}'.format(encoding)
    try:
        # Encode once.  bytearray iterates as integers on both Python 2 and
        # Python 3, so no ord() call is needed on the individual bytes.
        encoded = bytearray(code_point.encode(encoding))
    except (UnicodeError, LookupError):
        # UnicodeError: text not representable in this codec.
        # LookupError: the codec name itself is not known.
        print(''.join([label,
                       ' ---> ',
                       'Unable to encode the codepoint in {0}'.format(encoding)]))
        return
    hex_bytes = ':'.join('{0:x}'.format(byte) for byte in encoded)
    print(''.join([label, ' ---> ', hex_bytes, ' (', str(len(encoded)), ')']))
# For every sample string, print the string itself, its repr, and then its
# byte representation in each encoding below (one line per encoding).
# print(...) with a single argument behaves the same on Python 2 and 3.
for code_point in code_points:
    print('{:>15}'.format('character') + ' ---> ' + code_point)
    print('{:>15}'.format('code points') + ' ---> ' + repr(code_point))
    for coding in ('ascii', 'latin-1', 'utf-8',
                   'utf-16', 'utf-16be', 'utf-16le'):
        handle_encoding(coding, code_point)
# python encoding_info.py
# character ---> 🍺EΛ
# code points ---> u'\U0001f37aE\u039b'
# ascii ---> Unable to encode the codepoint in ascii
# latin-1 ---> Unable to encode the codepoint in latin-1
# utf-8 ---> f0:9f:8d:ba:45:ce:9b (7)
# utf-16 ---> ff:fe:3c:d8:7a:df:45:0:9b:3 (10)
# utf-16be ---> d8:3c:df:7a:0:45:3:9b (8)
# utf-16le ---> 3c:d8:7a:df:45:0:9b:3 (8)
# character ---> 🍺
# code points ---> u'\U0001f37a'
# ascii ---> Unable to encode the codepoint in ascii
# latin-1 ---> Unable to encode the codepoint in latin-1
# utf-8 ---> f0:9f:8d:ba (4)
# utf-16 ---> ff:fe:3c:d8:7a:df (6)
# utf-16be ---> d8:3c:df:7a (4)
# utf-16le ---> 3c:d8:7a:df (4)
# python encoding_info.py "OLA KE ASE KOMO VA KE ASE"
# character ---> OLA KE ASE KOMO VA KE ASE
# code points ---> u'OLA KE ASE KOMO VA KE ASE'
# ascii ---> 4f:4c:41:20:4b:45:20:41:53:45:20:4b:4f:4d:4f:20:56:41:20:4b:45:20:41:53:45 (25)
# latin-1 ---> 4f:4c:41:20:4b:45:20:41:53:45:20:4b:4f:4d:4f:20:56:41:20:4b:45:20:41:53:45 (25)
# utf-8 ---> 4f:4c:41:20:4b:45:20:41:53:45:20:4b:4f:4d:4f:20:56:41:20:4b:45:20:41:53:45 (25)
# utf-16 ---> ff:fe:4f:0:4c:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45:0:20:0:4b:0:4f:0:4d:0:4f:0:20:0:56:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45:0 (52)
# utf-16be ---> 0:4f:0:4c:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45:0:20:0:4b:0:4f:0:4d:0:4f:0:20:0:56:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45 (50)
# utf-16le ---> 4f:0:4c:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45:0:20:0:4b:0:4f:0:4d:0:4f:0:20:0:56:0:41:0:20:0:4b:0:45:0:20:0:41:0:53:0:45:0 (50)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment