treyhunner/utf8ify.py

## utf8ify.py
"""
Just some code that needlessly converts Unicode codepoints to UTF-8.

Example:

    $ python utf8ify.py U+2728
    Bytes: 0xe2 0x9c 0xa8
    Text: ✨

    $ python utf8ify.py U+1F3F3 U+FE0F U+200D U+1F308
    Bytes: 0xf0 0x9f 0x8f 0xb3 0xef 0xb8 0x8f 0xe2 0x80 0x8d 0xf0 0x9f 0x8c 0x88
    Text: 🏳️‍🌈

"""
from collections import deque
import sys


# Mask for the low six bits: the payload carried by one continuation byte.
LAST_6_BITS = (1 << 6) - 1
# Mask for the low seven bits (kept alongside the six-bit mask).
LAST_7_BITS = (1 << 7) - 1
# Marker bits (10xxxxxx) OR-ed into every continuation byte.
TAIL_PREFIX = 1 << 7
# Marker bits for the leading byte, keyed by the sequence's total byte count.
HEADERS = {
    2: 0xC0,  # 110xxxxx
    3: 0xE0,  # 1110xxxx
    4: 0xF0,  # 11110xxx
}


def parse_codepoint_string(string):
    """
    Turn a codepoint label into its integer value.

    A leading "U+" is dropped if present; what remains is read as a
    base-16 number, so "U+2728" gives 0x2728.
    """
    prefix = "U+"
    hex_digits = string[len(prefix):] if string.startswith(prefix) else string
    return int(hex_digits, base=16)


def codepoint_to_utf8(codepoint):
    r"""
    Convert codepoint number (e.g. 0x2728) to UTF-8 bytes.

    Only Unicode scalar values are accepted: integers from 0 through
    0x10FFFF, excluding the surrogate range 0xD800 through 0xDFFF.
    Anything else has no valid UTF-8 form and raises ``ValueError``.

    Example::

        >>> codepoint_to_utf8(0x2728)
        b'\xe2\x9c\xa8'
    """
    bit_count = codepoint.bit_length()
    out_of_range = codepoint < 0 or codepoint > 0x10FFFF
    is_surrogate = 0xD800 <= codepoint <= 0xDFFF
    if out_of_range or is_surrogate:
        raise ValueError("Invalid codepoint")
    if bit_count <= 7:  # ASCII: the codepoint is its own single byte
        return bytes([codepoint])
    elif bit_count <= 11:
        octet_count = 2
    elif bit_count <= 16:
        octet_count = 3
    else:  # the range check above caps the value at 21 bits
        octet_count = 4
    octets = deque()
    # Peel six payload bits at a time off the low end; each group becomes a
    # continuation byte, pushed on the left so the final order is
    # most-significant first.
    for _ in range(octet_count - 1):
        octets.appendleft(TAIL_PREFIX | (codepoint & LAST_6_BITS))
        codepoint >>= 6
    # The bits that remain sit beside the leading byte's length marker.
    octets.appendleft(HEADERS[octet_count] | codepoint)
    return bytes(octets)


def parse_codepoints(string):
    r"""
    Encode whitespace-separated Unicode codepoints (e.g. U+2728) as UTF-8.

    Each token is parsed with parse_codepoint_string and encoded with
    codepoint_to_utf8; the encoded pieces are concatenated in input order.

    Example:

        >>> parse_codepoints("U+1F3F3 U+FE0F U+200D U+1F308")
        b'\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88'
    """
    encoded_pieces = (
        codepoint_to_utf8(parse_codepoint_string(token))
        for token in string.split()
    )
    return b"".join(encoded_pieces)


def parse_codepoints_but_easy(string):
    r"""
    Same job as parse_codepoints, but without re-implementing the wheel.

    Each whitespace-separated token is parsed with parse_codepoint_string,
    turned into a one-character string with ``chr`` and encoded by Python's
    own UTF-8 codec; the encoded pieces are concatenated in input order.

    Values ``chr`` or the codec reject (for example lone surrogates, whose
    UnicodeEncodeError is a ValueError subclass) raise instead of producing
    bytes.

    Example:

        >>> parse_codepoints_but_easy("U+2728")
        b'\xe2\x9c\xa8'
    """
    # bytes([n]) would emit one raw byte, which is only UTF-8 for n < 0x80
    # and fails outright above 0xFF; go through str so the codec does the
    # multi-byte encoding.
    return b"".join([
        chr(parse_codepoint_string(substring)).encode("utf-8")
        for substring in string.split()
    ])


def codepoints_from_text(string):
    """
    Convert string to string of codepoints (starting with U+).

    Every character becomes "U+" followed by its codepoint in uppercase
    hexadecimal, zero-padded to at least four digits (the conventional
    U+ notation). Entries are separated by single spaces; an empty input
    gives an empty string.

    Example:

        >>> codepoints_from_text("🌈🦆🇦🇶")
        'U+1F308 U+1F986 U+1F1E6 U+1F1F6'
        >>> codepoints_from_text("A")
        'U+0041'
    """
    return " ".join(f"U+{ord(character):04X}" for character in string)


if __name__ == "__main__":
    # All command-line arguments together form one space-separated list of
    # codepoints; encode them, then show the raw bytes and the decoded text.
    encoded = parse_codepoints(" ".join(sys.argv[1:]))
    hex_bytes = [format(octet, "#x") for octet in encoded]
    print("Bytes:", *hex_bytes)
    decoded_text = encoded.decode("utf-8")
    print("Text:", decoded_text)
	"""
	Just some code that needlessly converts unicode codepoints to UTF-8.

	Example:

	$ python utf8ify.py U+2728
	Bytes: 0xe2 0x9c 0xa8
	Text: ✨

	$ python utf8ify.py U+1F3F3 U+FE0F U+200D U+1F308
	Bytes: 0xf0 0x9f 0x8f 0xb3 0xef 0xb8 0x8f 0xe2 0x80 0x8d 0xf0 0x9f 0x8c 0x88
	Text: 🏳️‍🌈

	"""
	from collections import deque
	import sys


	LAST_6_BITS = 0b0011_1111
	LAST_7_BITS = 0b0111_1111
	TAIL_PREFIX = 0b1000_0000
	HEADERS = {
	2: 0b1100_0000,
	3: 0b1110_0000,
	4: 0b1111_0000,
	}


	def parse_codepoint_string(string):
	"""Convert codepoint string to number (e.g. U+2728 to 0x2728)."""
	return int(string.removeprefix("U+"), 16)


	def codepoint_to_utf8(codepoint):
	"""
	Convert codepoint number (e.g. 0x2728) to UTF-8 bytes.

	Example::

	>>> codepoint_to_utf8(0x2728)
	b'\xe2\x9c\xa8'
	"""
	if codepoint.bit_length() <= 7: # ASCII
	return bytes([codepoint])
	elif codepoint.bit_length() <= 11:
	octet_count = 2
	elif codepoint.bit_length() <= 16:
	octet_count = 3
	elif codepoint.bit_length() <= 21:
	octet_count = 4
	else:
	raise ValueError("Invalid codepoint")
	octets = deque()
	for n in range(octet_count-1):
	octets.appendleft(TAIL_PREFIX \| codepoint & LAST_6_BITS)
	codepoint = codepoint >> 6
	octets.appendleft(HEADERS[octet_count] \| codepoint)
	return bytes(octets)


	def parse_codepoints(string):
	"""
	Parse string of space-separated UTF-8 codepoints (e.g. U+2728).

	Example:

	>>> parse_codepoints("U+1F3F3 U+FE0F U+200D U+1F308")
	b'\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88'
	"""
	return b"".join([
	codepoint_to_utf8(parse_codepoint_string(substring))
	for substring in string.split()
	])


	def parse_codepoints_but_easy(string):
	"""Same function as above, but without re-implementing the wheel."""
	return b"".join([
	bytes([parse_codepoint_string(substring)])
	for substring in string.split()
	])


	def codepoints_from_text(string):
	"""
	Convert string to string of codepoints (starting with U+).

	Example:

	>>> codepoints_from_text("🌈🦆🇦🇶")
	'U+1F308 U+1F986 U+1F1E6 U+1F1F
	"""
	return " ".join([
	f"U+{ord(character):X}"
	for character in string
	])


	if __name__ == "__main__":
	utf8_bytes = parse_codepoints(" ".join(sys.argv[1:]))
	print("Bytes:", *[f"{byte:#x}" for byte in utf8_bytes])
	print("Text:", utf8_bytes.decode("utf-8"))