Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Convert hex UTF-8 bytes to a hex Unicode code point number
import sys
def ch2int(ch):
    """Return the numeric value (0-15) of a single hexadecimal digit.

    Both lowercase and uppercase letters are accepted. Any character that
    is not a hexadecimal digit yields 0 rather than raising an error.
    """
    # Each entry: (first char of range, last char of range, value of first).
    digit_ranges = (
        ('0', '9', 0),
        ('a', 'f', 10),
        ('A', 'F', 10),
    )
    for low, high, base in digit_ranges:
        if low <= ch <= high:
            return base + ord(ch) - ord(low)
    # Not a hex digit: fall back to 0.
    return 0
def utf8_to_unicode(utf8):
    """Convert one hex-encoded UTF-8 sequence to ``U+XXXX`` notation.

    Args:
        utf8: A string such as ``'0xe282ac'``: a ``0x``/``0X`` prefix
            followed by the hex digits of exactly one UTF-8 encoded
            character. An odd number of digits is left-padded with a
            single ``'0'`` (so ``'0x0'`` is read as the byte ``00``).

    Returns:
        The code point formatted as ``'U+'`` followed by at least four
        uppercase hex digits (e.g. ``'U+20AC'``, ``'U+10348'``), or
        ``None`` when the input is empty, lacks the ``0x`` prefix,
        contains a non-hex character, has an invalid lead or continuation
        byte, has missing or surplus bytes, or decodes to a value above
        U+10FFFF.

    Note:
        Only the structural bit patterns are checked; overlong encodings
        and surrogate code points are not rejected.
    """
    if not utf8 or not utf8.lower().startswith('0x'):
        return None

    digits = utf8[2:]
    if len(digits) % 2 == 1:
        digits = '0' + digits
    if len(digits) < 2:
        return None

    # Reject anything that is not a plain ASCII hex digit up front, so
    # garbage such as '0xzz' is reported as invalid instead of being
    # silently decoded as zero bytes.
    hex_digits = '0123456789abcdefABCDEF'
    if not all(ch in hex_digits for ch in digits):
        return None

    octets = [int(digits[i:i + 2], 16) for i in range(0, len(digits), 2)]

    # The lead byte determines how many continuation bytes follow and
    # contributes the most significant payload bits.
    lead = octets[0]
    if (lead & 0b10000000) == 0b00000000:
        code, extra = lead, 0
    elif (lead & 0b11100000) == 0b11000000:
        code, extra = lead & 0b00011111, 1
    elif (lead & 0b11110000) == 0b11100000:
        code, extra = lead & 0b00001111, 2
    elif (lead & 0b11111000) == 0b11110000:
        code, extra = lead & 0b00000111, 3
    else:
        return None

    # Exactly one character: neither truncated nor followed by more bytes.
    if len(octets) != extra + 1:
        return None

    for octet in octets[1:]:
        # Continuation bytes must look like 10xxxxxx.
        if (octet & 0b11000000) != 0b10000000:
            return None
        code = (code << 6) | (octet & 0b00111111)

    # Four-byte sequences can encode values past the end of Unicode.
    if code > 0x10FFFF:
        return None

    # Zero-pad to four digits but never truncate: supplementary-plane
    # code points need five or six hex digits.
    return 'U+%04X' % code
# $ python ./utf8_to_unicode.py 0xe282ac 0x24 0xc2a2 0xe0a4b9 0xed959c 0xf0908d8888 0x0
# ('0xe282ac', '-->', 'U+20AC')
# ('0x24', '-->', 'U+0024')
# ('0xc2a2', '-->', 'U+00A2')
# ('0xe0a4b9', '-->', 'U+0939')
# ('0xed959c', '-->', 'U+D55C')
# ('0xf0908d8888', '-->', None)
# ('0x0', '-->', 'U+0000')
if __name__ == "__main__":
    # Treat every command-line argument as one hex UTF-8 sequence and
    # report its conversion result next to the original argument.
    hex_args = sys.argv[1:]
    for hex_arg in hex_args:
        print(hex_arg, '-->', utf8_to_unicode(hex_arg))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.