wjandrea/flag_homoglyph_utf8.py

## flag_homoglyph_utf8.py
#!/usr/bin/env python3
"""
Use Unicode flag emoji homoglyphs to re-encode UTF-8.

To encode, a string is first encoded as UTF-8, then each byte is
broken down into bits, and each bit is encoded as a flag:
The more well-known country flag in each pair represents 0, and the
less well-known one represents 1. Pairs are cycled through.

There are 8 pairs total, which is the most I could find without repeats.
(Norway has two homoglyphs, so I only included one. France, however,
is used in two pairs: with St. Martin and with Clipperton Island.)

Note that if any of these countries change their flags in the future,
this'll stop working, because the glyphs aren't specified in Unicode.
It's implementation-dependent what each REGIONAL INDICATOR SYMBOL
sequence represents and what it looks like.

Created for Puzzling Stack Exchange and posted there:
https://puzzling.stackexchange.com/q/116097/63368
"""

from itertools import cycle as _cycle
from typing import (
    Dict as _Dict,
    Iterable as _Iterable,
    List as _List,
    Tuple as _Tuple,
    )


def _convert_country_code_to_ris(country_code: str) -> str:
    r"""
    Convert ASCII uppercase letters to REGIONAL INDICATOR SYMBOLs.

    >>> _convert_country_code_to_ris('CA')
    '\U0001f1e8\U0001f1e6'
    """
    assert all('A' <= c <= 'Z' for c in country_code)
    shift: int = ord('\N{REGIONAL INDICATOR SYMBOL LETTER A}') - ord('A')
    return ''.join(chr(ord(c)+shift) for c in country_code)


# Look-alike flags as two-letter country codes: (well-known, lesser-known).
# The first code of a pair stands for bit 0, the second for bit 1.
# Note that France is the well-known half of two different pairs.
_COUNTRY_CODES: _List[_Tuple[str, str]] = [
    ('RO', 'TD'),  # Romania / Chad
    ('ID', 'MC'),  # Indonesia / Monaco - not identical, but close enough
    ('US', 'UM'),  # USA / USA Minor Outlying Islands
    ('NO', 'SJ'),  # Norway / Svalbard & Jan Mayen
    ('FR', 'MF'),  # France / St. Martin
    ('AU', 'HM'),  # Australia / Heard & McDonald Islands
    ('ES', 'EA'),  # Spain / Ceuta & Melilla
    ('FR', 'CP'),  # France / Clipperton Island
    # ('NO', 'BV'),  # Norway / Bouvet Island - second Norway pair, unused
]

# The same pairs, each code converted to its REGIONAL INDICATOR SYMBOL
# sequence (the text that renders as the flag emoji).
FLAG_PAIRS: _List[_Tuple[str, str]] = [
    (
        _convert_country_code_to_ris(well_known),
        _convert_country_code_to_ris(lesser_known),
    )
    for well_known, lesser_known in _COUNTRY_CODES
]


def encode(message: str) -> _Iterable[str]:
    """
    Turn a message into flag homoglyphs, one string per UTF-8 byte.

    The message is encoded as UTF-8. Each byte is written as eight
    bits, most significant first, and every bit becomes the first flag
    (for 0) or the second flag (for 1) of a pair from FLAG_PAIRS.
    Consecutive bytes use consecutive pairs, starting over at the first
    pair once all have been used.

    >>> list(encode('ba'))
    ['🇷🇴🇹🇩🇹🇩🇷🇴🇷🇴🇷🇴🇹🇩🇷🇴', '🇮🇩🇲🇨🇲🇨🇮🇩🇮🇩🇮🇩🇮🇩🇲🇨']
    """
    utf8: bytes = message.encode('utf-8')
    for byte, (zero_flag, one_flag) in zip(utf8, _cycle(FLAG_PAIRS)):
        bits: str = format(byte, '08b')
        yield ''.join(one_flag if bit == '1' else zero_flag for bit in bits)


def decode(flag_encoded: _Iterable[str]) -> str:
    """
    Decode message from flag homoglyphs then UTF-8.

    I.e. do the opposite of `encode()`: every item of `flag_encoded` is
    one byte written as flags, and the n-th item is read using the n-th
    pair of FLAG_PAIRS (cycling through the pairs).

    Raises ValueError naming the offending item when it contains
    neither flag of the expected pair, when it cannot be read as a
    binary number after the flags are substituted, or when its value
    does not fit in one byte. UnicodeDecodeError (a ValueError
    subclass) propagates if the decoded bytes are not valid UTF-8.

    >>> decode(encode('ba'))
    'ba'
    """
    message_ints: _List[int] = []
    for flag_byte, (flag0, flag1) in zip(flag_encoded, _cycle(FLAG_PAIRS)):
        binary_repr: str = flag_byte.replace(flag0, '0').replace(flag1, '1')

        if binary_repr == flag_byte:
            # Nothing was replaced
            raise ValueError(f"Could not decode: {flag_byte!r}")

        try:
            int_: int = int(binary_repr, 2)
        except ValueError:
            # Something other than the two expected flags was left over.
            raise ValueError(f"Could not decode: {flag_byte!r}") from None

        if not 0 <= int_ <= 0xFF:
            # E.g. more than eight flags. bytes() would reject the value
            # below, but without saying which item was at fault.
            raise ValueError(f"Could not decode: {flag_byte!r}")

        message_ints.append(int_)
    return bytes(message_ints).decode('utf-8')


def main() -> None:
    """Build the puzzle's body and title, print them, and self-check."""
    # Body: one line of flags per UTF-8 byte of the solution text.
    solution = 'You have solved this puzzle!'
    body_lines = [*encode(solution)]
    print(*body_lines, sep='\n')
    assert decode(body_lines) == solution

    print()

    # Title: each character is encoded on its own, so every one of them
    # uses the first pair (Romania/Chad) instead of cycling onward.
    # Decoding therefore has to be done one item at a time as well.
    title_text = 'TD'
    title_flags = [next(iter(encode(char))) for char in title_text]
    print(*title_flags)
    assert ''.join(decode([flags]) for flags in title_flags) == title_text


# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
	#!/usr/bin/env python3
	"""
	Use Unicode flag emoji homoglyphs to re-encode UTF-8.

	To encode, a string is first encoded as UTF-8, then each byte is
	broken down into bits, and each bit is encoded as a flag:
	The more well-known country flag in each pair represents 0, and the
	less well-known one represents 1. Pairs are cycled through.

	There are 8 pairs total, which is the most I could find without repeats.
	(Norway has two homoglyphs, so I only included one. France, however,
	is used in two pairs: with St. Martin and with Clipperton Island.)

	Note that if any of these countries change their flags in the future,
	this'll stop working, because the glyphs aren't specified in Unicode.
	It's implementation-dependent what each REGIONAL INDICATOR SYMBOL
	sequence represents and what it looks like.

	Created for Puzzling Stack Exchange and posted there:
	https://puzzling.stackexchange.com/q/116097/63368
	"""

	from itertools import cycle as _cycle
	from typing import (
	Dict as _Dict,
	Iterable as _Iterable,
	List as _List,
	Tuple as _Tuple,
	)


	def _convert_country_code_to_ris(country_code: str) -> str:
	r"""
	Convert ASCII uppercase letters to REGIONAL INDICATOR SYMBOLs.

	>>> _convert_country_code_to_ris('CA')
	'\U0001f1e8\U0001f1e6'
	"""
	assert all('A' <= c <= 'Z' for c in country_code)
	shift: int = ord('\N{REGIONAL INDICATOR SYMBOL LETTER A}') - ord('A')
	return ''.join(chr(ord(c)+shift) for c in country_code)


	# Look-alike flags as two-letter country codes: (well-known, lesser-known).
	# The first code of a pair stands for bit 0, the second for bit 1.
	# Note that France is the well-known half of two different pairs.
	_COUNTRY_CODES: _List[_Tuple[str, str]] = [
	    ('RO', 'TD'),  # Romania / Chad
	    ('ID', 'MC'),  # Indonesia / Monaco - not identical, but close enough
	    ('US', 'UM'),  # USA / USA Minor Outlying Islands
	    ('NO', 'SJ'),  # Norway / Svalbard & Jan Mayen
	    ('FR', 'MF'),  # France / St. Martin
	    ('AU', 'HM'),  # Australia / Heard & McDonald Islands
	    ('ES', 'EA'),  # Spain / Ceuta & Melilla
	    ('FR', 'CP'),  # France / Clipperton Island
	    # ('NO', 'BV'),  # Norway / Bouvet Island - second Norway pair, unused
	]

	# The same pairs, each code converted to its REGIONAL INDICATOR SYMBOL
	# sequence (the text that renders as the flag emoji).
	FLAG_PAIRS: _List[_Tuple[str, str]] = [
	    (
	        _convert_country_code_to_ris(well_known),
	        _convert_country_code_to_ris(lesser_known),
	    )
	    for well_known, lesser_known in _COUNTRY_CODES
	]


	def encode(message: str) -> _Iterable[str]:
	    """
	    Turn a message into flag homoglyphs, one string per UTF-8 byte.

	    The message is encoded as UTF-8. Each byte is written as eight
	    bits, most significant first, and every bit becomes the first flag
	    (for 0) or the second flag (for 1) of a pair from FLAG_PAIRS.
	    Consecutive bytes use consecutive pairs, starting over at the first
	    pair once all have been used.

	    >>> list(encode('ba'))
	    ['🇷🇴🇹🇩🇹🇩🇷🇴🇷🇴🇷🇴🇹🇩🇷🇴', '🇮🇩🇲🇨🇲🇨🇮🇩🇮🇩🇮🇩🇮🇩🇲🇨']
	    """
	    utf8: bytes = message.encode('utf-8')
	    for byte, (zero_flag, one_flag) in zip(utf8, _cycle(FLAG_PAIRS)):
	        bits: str = format(byte, '08b')
	        yield ''.join(one_flag if bit == '1' else zero_flag for bit in bits)


	def decode(flag_encoded: _Iterable[str]) -> str:
	    """
	    Decode message from flag homoglyphs then UTF-8.

	    I.e. do the opposite of `encode()`: every item of `flag_encoded` is
	    one byte written as flags, and the n-th item is read using the n-th
	    pair of FLAG_PAIRS (cycling through the pairs).

	    Raises ValueError naming the offending item when it contains
	    neither flag of the expected pair, when it cannot be read as a
	    binary number after the flags are substituted, or when its value
	    does not fit in one byte. UnicodeDecodeError (a ValueError
	    subclass) propagates if the decoded bytes are not valid UTF-8.

	    >>> decode(encode('ba'))
	    'ba'
	    """
	    message_ints: _List[int] = []
	    for flag_byte, (flag0, flag1) in zip(flag_encoded, _cycle(FLAG_PAIRS)):
	        binary_repr: str = flag_byte.replace(flag0, '0').replace(flag1, '1')

	        if binary_repr == flag_byte:
	            # Nothing was replaced
	            raise ValueError(f"Could not decode: {flag_byte!r}")

	        try:
	            int_: int = int(binary_repr, 2)
	        except ValueError:
	            # Something other than the two expected flags was left over.
	            raise ValueError(f"Could not decode: {flag_byte!r}") from None

	        if not 0 <= int_ <= 0xFF:
	            # E.g. more than eight flags. bytes() would reject the value
	            # below, but without saying which item was at fault.
	            raise ValueError(f"Could not decode: {flag_byte!r}")

	        message_ints.append(int_)
	    return bytes(message_ints).decode('utf-8')


	def main() -> None:
	    """Build the puzzle's body and title, print them, and self-check."""
	    # Body: one line of flags per UTF-8 byte of the solution text.
	    solution = 'You have solved this puzzle!'
	    body_lines = [*encode(solution)]
	    print(*body_lines, sep='\n')
	    assert decode(body_lines) == solution

	    print()

	    # Title: each character is encoded on its own, so every one of them
	    # uses the first pair (Romania/Chad) instead of cycling onward.
	    # Decoding therefore has to be done one item at a time as well.
	    title_text = 'TD'
	    title_flags = [next(iter(encode(char))) for char in title_text]
	    print(*title_flags)
	    assert ''.join(decode([flags]) for flags in title_flags) == title_text


	# Run the demo only when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()