Skip to content

Instantly share code, notes, and snippets.

@treyhunner
Last active September 22, 2022 18:30
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save treyhunner/3c517975af26ab1676ab9fb3a6c7c681 to your computer and use it in GitHub Desktop.
Save treyhunner/3c517975af26ab1676ab9fb3a6c7c681 to your computer and use it in GitHub Desktop.
I implemented code to convert Unicode code points to UTF-8, just for fun. Inspired by https://sethmlarson.dev/blog/utf-8
"""
Just some code that needlessly converts Unicode codepoints to UTF-8.
Example:
$ python utf8ify.py U+2728
Bytes: 0xe2 0x9c 0xa8
Text: ✨
$ python utf8ify.py U+1F3F3 U+FE0F U+200D U+1F308
Bytes: 0xf0 0x9f 0x8f 0xb3 0xef 0xb8 0x8f 0xe2 0x80 0x8d 0xf0 0x9f 0x8c 0x88
Text: 🏳️‍🌈
"""
from collections import deque
import sys
# Bit masks and marker bits used when packing a codepoint into UTF-8 octets.
# The underscore in each literal separates the marker bits from the payload.

# Mask selecting the low 6 bits (the payload of one continuation octet).
LAST_6_BITS = 0b00_111111
# Mask selecting the low 7 bits (the payload of a single-octet sequence).
LAST_7_BITS = 0b0_1111111
# Marker bits (10xxxxxx) OR-ed onto every continuation octet.
TAIL_PREFIX = 0b10_000000
# Marker bits of the leading octet, keyed by total octets in the sequence.
HEADERS = {
    2: 0b110_00000,
    3: 0b1110_0000,
    4: 0b11110_000,
}
def parse_codepoint_string(string):
    """
    Return the integer value of a codepoint string.

    A leading "U+" is stripped if present and the remainder is parsed as
    a base-16 number (e.g. "U+2728" gives 0x2728).
    """
    hex_digits = string.removeprefix("U+")
    return int(hex_digits, base=16)
def codepoint_to_utf8(codepoint):
    r"""
    Convert codepoint number (e.g. 0x2728) to UTF-8 bytes.

    Args:
        codepoint: Integer codepoint in the range 0 to 0x10FFFF,
            excluding the surrogate range 0xD800 to 0xDFFF.

    Returns:
        A bytes object of 1 to 4 octets holding the UTF-8 encoding.

    Raises:
        ValueError: If codepoint is negative, greater than 0x10FFFF,
            or a surrogate (none of these have a UTF-8 encoding).

    Example::

        >>> codepoint_to_utf8(0x2728)
        b'\xe2\x9c\xa8'
    """
    # A bit-length check alone would let 0x110000-0x1FFFFF and the
    # surrogates through, and would give negatives a confusing error.
    if not 0 <= codepoint <= 0x10FFFF or 0xD800 <= codepoint <= 0xDFFF:
        raise ValueError("Invalid codepoint")

    bit_length = codepoint.bit_length()
    if bit_length <= 7:  # ASCII
        return bytes([codepoint])
    if bit_length <= 11:
        octet_count = 2
    elif bit_length <= 16:
        octet_count = 3
    else:
        octet_count = 4

    octets = deque()
    # Peel off 6 payload bits at a time, lowest bits first, so each new
    # continuation octet goes in front of the ones already produced.
    for _ in range(octet_count - 1):
        octets.appendleft(TAIL_PREFIX | codepoint & LAST_6_BITS)
        codepoint >>= 6
    # Whatever bits remain fit in the leading octet after its marker bits.
    octets.appendleft(HEADERS[octet_count] | codepoint)
    return bytes(octets)
def parse_codepoints(string):
    r"""
    Encode a whitespace-separated string of Unicode codepoints as UTF-8.

    Each token (e.g. U+2728) is parsed with parse_codepoint_string and
    encoded with codepoint_to_utf8; the encoded pieces are concatenated.

    Example:

        >>> parse_codepoints("U+1F3F3 U+FE0F U+200D U+1F308")
        b'\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88'
    """
    codepoints = map(parse_codepoint_string, string.split())
    return b"".join(map(codepoint_to_utf8, codepoints))
def parse_codepoints_but_easy(string):
    """
    Same result as parse_codepoints, but without re-implementing the wheel.

    Each whitespace-separated token (e.g. U+2728) is parsed with
    parse_codepoint_string, turned into a one-character string with chr
    and encoded with the built-in UTF-8 codec.

    Raises:
        ValueError: If a token is not valid hexadecimal, is outside the
            range chr accepts, or is a surrogate (the codec raises
            UnicodeEncodeError, a ValueError subclass).
    """
    # bytes([n]) only works for n < 256 and yields a raw byte, not UTF-8;
    # chr(n).encode() is the built-in equivalent of codepoint_to_utf8.
    return b"".join([
        chr(parse_codepoint_string(substring)).encode("utf-8")
        for substring in string.split()
    ])
def codepoints_from_text(string):
    """
    Describe each character of a string as a U+ codepoint.

    Returns one space-separated string with an uppercase hexadecimal
    "U+..." entry per character, in order.

    Example:

        >>> codepoints_from_text("🌈🦆🇦🇶")
        'U+1F308 U+1F986 U+1F1E6 U+1F1F6'
    """
    labels = ("U+" + format(ord(symbol), "X") for symbol in string)
    return " ".join(labels)
if __name__ == "__main__":
    # All command-line arguments together form the list of codepoints.
    utf8_bytes = parse_codepoints(" ".join(sys.argv[1:]))
    # Show each encoded octet as lowercase hex with a 0x prefix.
    hex_octets = [format(octet, "#x") for octet in utf8_bytes]
    print("Bytes:", *hex_octets)
    print("Text:", str(utf8_bytes, "utf-8"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment