Skip to content

Instantly share code, notes, and snippets.

@wjandrea
Created May 11, 2022 21:03
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wjandrea/2ad5710bb2fd657739fcd585e0287d46 to your computer and use it in GitHub Desktop.
Save wjandrea/2ad5710bb2fd657739fcd585e0287d46 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Use Unicode flag emoji homoglyphs to re-encode UTF-8.
To encode, a string is first encoded as UTF-8, then each byte is
broken down into bits, and each bit is encoded as a flag:
The more well-known country flag in each pair represents 0, and the
less well-known one represents 1. Pairs are cycled through.
There are 8 pairs total, which is the most I could find without repeats.
(Norway has two homoglyphs, so I only included one.)
Note that if any of these countries change their flags in the future,
this'll stop working, because the glyphs aren't specified in Unicode.
It's implementation-dependent what each REGIONAL INDICATOR SYMBOL
sequence represents and what it looks like.
Created for Puzzling Stack Exchange and posted there:
https://puzzling.stackexchange.com/q/116097/63368
"""
from itertools import cycle as _cycle
from typing import (
Dict as _Dict,
Iterable as _Iterable,
List as _List,
Tuple as _Tuple,
)
def _convert_country_code_to_ris(country_code: str) -> str:
    r"""
    Convert ASCII uppercase letters to REGIONAL INDICATOR SYMBOLs.

    Every letter 'A'-'Z' is moved by one fixed offset onto the matching
    REGIONAL INDICATOR SYMBOL LETTER code point. An empty string is
    returned unchanged.

    Raises ValueError if `country_code` contains any character other than
    an ASCII uppercase letter.

    >>> _convert_country_code_to_ris('CA')
    '\U0001f1e8\U0001f1e6'
    """
    # Validate with an explicit raise: an `assert` is removed under
    # `python -O`, which would let bad input be shifted to arbitrary
    # code points without any error.
    if not all('A' <= c <= 'Z' for c in country_code):
        raise ValueError(
            f"Expected only ASCII uppercase letters: {country_code!r}")
    # Distance between 'A' and REGIONAL INDICATOR SYMBOL LETTER A; the
    # same distance applies to every other letter.
    shift: int = ord('\N{REGIONAL INDICATOR SYMBOL LETTER A}') - ord('A')
    return ''.join(chr(ord(c) + shift) for c in country_code)
# Pairs of two-letter region codes whose flag emoji look (nearly) alike.
# Within each pair, the first code's flag encodes bit 0 and the second
# code's flag encodes bit 1. The pairs are used in order, one pair per
# encoded byte, wrapping around after the last pair.
# Note that France ('FR') is the first member of two different pairs.
_COUNTRY_CODES: _List[_Tuple[str, str]] = [
    ('RO', 'TD'),  # Romania and Chad
    ('ID', 'MC'),  # Indonesia and Monaco - Not identical, but close enough
    ('US', 'UM'),  # USA and USA Minor Outlying Islands
    ('NO', 'SJ'),  # Norway and Svalbard & Jan Mayen
    ('FR', 'MF'),  # France and St. Martin
    ('AU', 'HM'),  # Australia and Heard & McDonald Islands
    ('ES', 'EA'),  # Spain and Ceuta & Melilla
    ('FR', 'CP'),  # France and Clipperton Island
    # ('NO', 'BV'),  # Norway and Bouvet Island
]
# The pairs from _COUNTRY_CODES, with every region code converted to its
# REGIONAL INDICATOR SYMBOL sequence. Position 0 of a pair encodes bit 0,
# position 1 encodes bit 1.
FLAG_PAIRS: _List[_Tuple[str, str]] = [
    (
        _convert_country_code_to_ris(common_code),
        _convert_country_code_to_ris(lookalike_code),
    )
    for common_code, lookalike_code in _COUNTRY_CODES
]
def encode(message: str) -> _Iterable[str]:
    """
    Encode message to UTF-8 then flag homoglyphs.

    Yield each encoded byte as a string of eight flags, most significant
    bit first. Byte number i uses pair number i of FLAG_PAIRS (wrapping
    around); the pair's first flag stands for a 0 bit and its second flag
    for a 1 bit. An empty message yields nothing.

    >>> list(encode('ba'))
    ['🇷🇴🇹🇩🇹🇩🇷🇴🇷🇴🇷🇴🇹🇩🇷🇴', '🇮🇩🇲🇨🇲🇨🇮🇩🇮🇩🇮🇩🇮🇩🇲🇨']
    """
    utf8_bytes: bytes = message.encode('utf-8')
    # zip() stops at the end of the message; the cycle supplies pairs
    # indefinitely.
    for byte, (flag0, flag1) in zip(utf8_bytes, _cycle(FLAG_PAIRS)):
        # Translation table turning each binary digit into its flag.
        bit_to_flag: _Dict[int, str] = str.maketrans(
            {'0': flag0, '1': flag1})
        # '08b' zero-pads so every byte becomes exactly eight flags.
        yield f'{byte:08b}'.translate(bit_to_flag)
def decode(flag_encoded: _Iterable[str]) -> str:
    """
    Decode message from flag homoglyphs then UTF-8.

    This is the inverse of `encode()`: item number i of `flag_encoded` is
    read with pair number i of FLAG_PAIRS (wrapping around), each flag is
    turned back into a binary digit, and the resulting bytes are decoded
    as UTF-8.

    Raises ValueError if an item contains neither flag of its pair.

    >>> decode(encode('ba'))
    'ba'
    """
    byte_values: _List[int] = []
    for chunk, (zero_flag, one_flag) in zip(flag_encoded, _cycle(FLAG_PAIRS)):
        bits: str = chunk
        # Substitute the 0-flag first, then the 1-flag.
        for flag, digit in {zero_flag: '0', one_flag: '1'}.items():
            bits = bits.replace(flag, digit)
        if bits == chunk:
            # Neither flag occurred in this chunk, so nothing changed.
            raise ValueError(f"Could not decode: {chunk!r}")
        byte_values.append(int(bits, 2))
    return bytes(byte_values).decode('utf-8')
def main() -> None:
    """Build the puzzle's body and title, print them, and self-check."""
    # Body: one encoded byte per line, cycling through all flag pairs.
    body_text = 'You have solved this puzzle!'
    body_flags = [*encode(body_text)]
    print('\n'.join(body_flags))
    assert body_text == decode(body_flags)

    print()

    # Title: every character is encoded on its own, so each one starts the
    # pair cycle afresh and only the first pair (Romania/Chad) is used.
    # Decoding therefore also has to happen one item at a time.
    title_text = 'TD'
    title_flags = [next(iter(encode(char))) for char in title_text]
    print(' '.join(title_flags))
    round_trip = ''.join(decode([item]) for item in title_flags)
    assert title_text == round_trip
# Print the puzzle content only when run as a script, not when imported.
if __name__ == '__main__':
    main()
@wjandrea
Copy link
Author

BTW, if this seems over-engineered, it's cause I'm practicing writing rigorous Python (typing, assertions, documentation, etc).

@t-leclercq
Copy link

BTW, if this seems over-engineered, it's cause I'm practicing writing rigorous Python (typing, assertions, documentation, etc).

Please don't justify yourself, thank you for this funny encoder ;)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment