Last active
July 25, 2023 21:21
-
-
Save serif/68fa1b389e90072d9c1b377de123a92d to your computer and use it in GitHub Desktop.
Morse code efficiency vs 5-bit ASCII
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import re | |
from typing import Dict, List | |
from os import path | |
from urllib.request import urlopen | |
from string import ascii_uppercase | |
# from itertools import product | |
# from collections import Counter | |
def main():
    """Print the full Morse-versus-fixed-width-binary comparison report.

    The report has three parts: the standard Morse table applied to single
    letters and two sample texts, a comparison of 5/6/8-bit widths, and two
    re-runs of the sample text with code tables produced by ``Generator``
    (first with its default ranking, then with the ranking measured on the
    corpus).
    """
    # NOTE: the name ``t`` is part of the output below (f-string ``=`` form).
    t: Translator = Translator()
    print('\nGenerating corpus data')
    corpus: Corpus = Corpus()

    # Show dict contents
    print(f'{t.d_morse=}')
    print(f'{t.d_morse_bin=}')
    print()

    # One table for the most common letters, then one for the least common.
    for letters in ('ETAOINSHR', 'JXQZ'):
        t.header()
        for letter in letters:
            t.test(letter, True)
        print()

    # Text test 1
    t.test('HELLO WORLD.')

    # Text test 2, Article 1 of the Universal Declaration of Human Rights
    article: str = (
        'All human beings are born free and equal in dignity and rights. '
        'They are endowed with reason and conscience '
        'and should act towards one another in a spirit of brotherhood.'
    ).upper()
    t.test(article)
    t.fast_test(corpus.freq)

    # Bit width comparison
    t.test_widths(article)

    def rerun_with(generator):
        """Swap the generator's table into ``t`` and repeat both text tests."""
        t.d_morse = generator.morse
        t.gen_morse_bin()
        t.test(article)
        t.fast_test(corpus.freq)

    # Generate naively optimized Morse
    print('\n-- Optimized by Wikipedia\'s letter frequency\n')
    rerun_with(Generator())

    # Generate better optimized Morse
    print('\n-- Optimized by frequency of corpus with CW abbreviations\n')
    rerun_with(Generator(etaoin=''.join(corpus.freq.keys())))
class Translator:
    """Encode text as Morse, as Morse keying bits, and as fixed-width binary.

    ``d_morse`` maps each supported character to its dot/dash spelling (a dot
    is ``.``, a dash is the en dash ``–``).  ``d_morse_bin`` is derived from
    it by :meth:`gen_morse_bin` and holds the on/off keying pattern as a
    string of ``1``/``0`` time units.  Callers may assign a different table to
    ``d_morse`` and call :meth:`gen_morse_bin` again to evaluate it.
    """

    d_morse: Dict[str, str]
    d_morse_bin: Dict[str, str]

    def __init__(self):
        """Load the standard Morse table and derive its binary form."""
        # Dict: Morse
        self.d_morse = {
            'A': '.–',
            'B': '–...',
            'C': '–.–.',
            'D': '–..',
            'E': '.',
            'F': '..–.',
            'G': '––.',
            'H': '....',
            'I': '..',
            'J': '.–––',
            'K': '–.–',
            'L': '.–..',
            'M': '––',
            'N': '–.',
            'O': '–––',
            'P': '.––.',
            'Q': '––.–',
            'R': '.–.',
            'S': '...',
            'T': '–',
            'U': '..–',
            'V': '...–',
            'W': '.––',
            'X': '–..–',
            'Y': '–.––',
            'Z': '––..',
            '1': '.––––',
            '2': '..–––',
            '3': '...––',
            '4': '....–',
            '5': '.....',
            '6': '–....',
            '7': '––...',
            '8': '–––..',
            '9': '––––.',
            '0': '–––––',
            '.': '.–.–.–',
            ',': '––..––',
            '?': '..––..',
            '\'': '.––––.',
            '!': '–.–.––',
            '/': '–..–.',
            '(': '–.––.',
            ')': '–.––.–',
            '&': '.–...',
            ':': '–––...',
            ';': '–.–.–.',
            '=': '–...–',
            '+': '.–.–.',
            '-': '–....–',
            '_': '..––.–',
            '"': '.–..–.',
            '$': '...–..–',
            '@': '.––.–.',
            'À': '.––.–',
            'Ä': '.–.–',
            'Ć': '–.–..',
            'Š': '––––',
            'Ð': '..––.',
            'È': '.–..–',
            # É is dot-dot-dash-dot-dot; it previously duplicated the È code.
            'É': '..–..',
            'Ĝ': '––.–.',
            'Ĵ': '.–––.',
            'Ñ': '––.––',
            # Ń shares the Ñ code; the rest of the file counts and ranks 'Ń',
            # so it must be encodable here as well.
            'Ń': '––.––',
            'Ó': '–––.',
            'Ś': '...–...',
            'Ŝ': '...–.',
            'Þ': '.––..',
            'Ü': '..––',
            'Ź': '––..–.',
            'Ż': '––..–',
            ' ': ' ',
        }
        self.gen_morse_bin()

    def gen_morse_bin(self):
        """Rebuild ``d_morse_bin`` from the current ``d_morse`` table.

        Each dot becomes ``10``, each dash ``1110`` and each space ``00``;
        every entry is then padded with a trailing ``00``.  A letter therefore
        ends in three ``0`` units, and a space entry (``0000``) following a
        letter yields seven.

        Raises:
            SystemExit: if a table entry contains anything other than a dot,
                an en dash or a space.
        """
        # Dict: Morse Bin
        unit_bits = {' ': '00', '.': '10', '–': '1110'}
        self.d_morse_bin = {}
        for key, morse in self.d_morse.items():
            pieces: List[str] = []
            for c in morse:
                if c not in unit_bits:
                    raise SystemExit(f'unexpected char "{c}" in d_morse["{key}"]')
                pieces.append(unit_bits[c])
            pieces.append('00')
            self.d_morse_bin[key] = ''.join(pieces)

    def to_morse(self, abc: str, space: bool = True) -> str:
        """Return the dot/dash spelling of ``abc``.

        Characters that have no entry in ``d_morse`` are skipped.  When
        ``space`` is true every encoded character is followed by one space.
        """
        separator = ' ' if space else ''
        # The membership test must look at the character being encoded, not at
        # the text accumulated so far.
        return ''.join(
            self.d_morse[c] + separator for c in abc if c in self.d_morse)

    def to_morse_bin(self, abc: str, space: bool = True) -> str:
        """Return the keying-bit string of ``abc``.

        When ``space`` is true every encoded character is followed by one
        space (a visual separator only; :meth:`bit_count` ignores it).

        Raises:
            KeyError: if a character has no entry in ``d_morse_bin``.
        """
        separator = ' ' if space else ''
        return ''.join(self.d_morse_bin[c] + separator for c in abc)

    @staticmethod
    def to_bin(abc: str, bits: int = 5, space: bool = True) -> str:
        """Return ``abc`` as fixed-width binary words of ``bits`` bits each.

        A space is encoded as value 0 and a full stop as value 1; every other
        character is encoded as the low ``bits`` bits of its code point.
        NOTE: at 5 bits the value 1 is also what 'A' encodes to.

        Raises:
            SystemExit: if ``bits`` is not in the range 1..8.
        """
        if not 0 < bits < 9:
            raise SystemExit(f'invalid bit width: {bits}')
        separator = ' ' if space else ''
        # Masking keeps every word exactly ``bits`` wide, including for code
        # points above 255 whose binary form is longer than eight digits.
        mask = (1 << bits) - 1
        words: List[str] = []
        for c in abc:
            if c == ' ':
                value = 0
            elif c == '.':
                value = 1
            else:
                value = ord(c) & mask
            words.append(format(value, f'0{bits}b') + separator)
        return ''.join(words)

    @staticmethod
    def header() -> None:
        """Print the Markdown table header used by one-line test rows."""
        print('Letter | Morse | Morse binary | Morse duration | 5-ASCII binary')
        print('---|---|----|----|----')

    def test(self, text: str, one_line: bool = False) -> None:
        """Print ``text`` encoded as Morse, Morse bits and 5-bit binary.

        With ``one_line`` the result is a single Markdown table row matching
        :meth:`header`; otherwise it is a short bulleted report with bit
        counts.
        """
        morse_str: str = self.to_morse(text)
        morse_bin_str: str = self.to_morse_bin(text)
        morse_bin_len: str = self.bit_count(morse_bin_str)
        ascii_str: str = self.to_bin(text)
        ascii_str_len: str = self.bit_count(ascii_str)
        if one_line:
            line = ' | '.join(
                [text, morse_str, morse_bin_str, morse_bin_len, ascii_str])
        else:
            line = (
                f'"{text}"\n\n* **Morse:** {morse_str}\n* **Morse binary, '
                f'{morse_bin_len} bits:** {morse_bin_str}\n'
                f'* **ASCII, {ascii_str_len} bits:** {ascii_str}\n'
            )
        print(line)

    def fast_test(self, freq: Dict[str, int]) -> None:
        """Print the total Morse bit count for a character-frequency table.

        ``freq`` maps a character to its number of occurrences; keys that are
        empty or whitespace-only are ignored.
        """
        count: int = sum(
            int(self.bit_count(self.to_morse_bin(k))) * v
            for k, v in freq.items()
            if k.strip() != ''
        )
        print(f'Library test, {count:,} bits')

    def test_widths(self, text: str) -> None:
        """Print the bit count of ``text`` at 5, 6 and 8 bits and in Morse."""
        lines: List[str] = [
            f'\n* {width}-ASCII: '
            f'{self.bit_count(self.to_bin(text, bits=width))} bits'
            for width in (5, 6, 8)
        ]
        morse_bin_len: str = self.bit_count(self.to_morse_bin(text))
        lines.append(f'\n* Morse : {morse_bin_len} bits')
        print(''.join(lines))

    @staticmethod
    def bit_count(stream) -> str:
        """Return, as a string, the length of ``stream`` without its spaces."""
        return str(len(stream.replace(' ', '')))
class Generator:
    """Build a Morse-like code table ordered by symbol frequency.

    All dot/dash sequences of one to five elements are ranked from cheapest
    to most expensive (``slots``) and handed out, in that order, to the
    characters of ``etaoin`` (most frequent first).  The result is ``morse``,
    a dict with the same shape as ``Translator.d_morse``.

    Every instance owns its own ``slots`` and ``morse`` dicts, so building a
    second generator never alters a table handed out by an earlier one.
    """

    # Default ranking, most frequent symbol first.
    _DEFAULT_ETAOIN = 'ETAOINSHRDLCUMWFGYPBVKJXQZ1234567890.,?!/ÀÄĆŠÉÈŃÓ'

    slots: Dict[str, int]
    morse: Dict[str, str]
    etaoin: str

    def __init__(self, etaoin: str = _DEFAULT_ETAOIN):
        """Create the ranked code list and assign it to ``etaoin``.

        Args:
            etaoin: characters to encode, ordered from most to least frequent.
        """
        self.etaoin = etaoin
        self.slots = {}
        self.morse = {}
        self.create_sorted_slots()
        self.fill_slots()

    def create_sorted_slots(self) -> None:
        """Populate ``slots`` with every 1-5 element code, cheapest first.

        A dot weighs 2 units and a dash 4; the stored cost is ten times the
        unit total plus the number of dots, so among codes of equal length in
        units the one with fewer dots ranks first.  Codes with equal cost keep
        their generation order (shorter first, then dot before dash).
        """
        candidates: List[str] = []
        level: List[str] = ['']
        for _ in range(5):
            # Extend every code of the previous length by a dot, then a dash.
            level = [prefix + symbol for prefix in level for symbol in '.–']
            candidates.extend(level)

        costs: Dict[str, int] = {}
        for code in candidates:
            dots = code.count('.')
            dashes = len(code) - dots
            costs[code] = (2 * dots + 4 * dashes) * 10 + dots
        # sorted() is stable, so ties keep the generation order above.
        self.slots = dict(sorted(costs.items(), key=lambda item: item[1]))

    def fill_slots(self):
        """Assign the ranked codes to characters and store them in ``morse``.

        Characters of ``etaoin`` receive the cheapest codes in order.  Any
        character of the default ranking that ``etaoin`` leaves out (typically
        punctuation, when the ranking comes from letter counts) then receives
        the next free codes, so it stays encodable with a code of its own.
        A space always maps to a space.

        Raises:
            IndexError: if ``etaoin`` has more characters than there are codes.
        """
        codes: List[str] = list(self.slots)
        if len(self.etaoin) > len(codes):
            raise IndexError(
                f'{len(self.etaoin)} symbols requested '
                f'but only {len(codes)} codes are available')

        table: Dict[str, str] = {}
        for symbol, code in zip(self.etaoin, codes):
            table[symbol] = code

        # Hand the remaining codes to default-ranking symbols not yet covered;
        # zip() stops quietly if the codes run out first.
        leftovers = [s for s in self._DEFAULT_ETAOIN if s not in table]
        for symbol, code in zip(leftovers, codes[len(self.etaoin):]):
            table[symbol] = code

        table[' '] = ' '
        self.morse = table
class Corpus:
    """Sample text used to measure character frequencies.

    The text is downloaded from Project Gutenberg, upper-cased, normalised
    (uncommon characters merged, common words replaced by CW abbreviations)
    and cached in a file named ``corpus`` in the working directory.  ``freq``
    maps each counted character to its number of occurrences, most frequent
    first.
    """

    # Word or phrase -> CW abbreviation.  Applied longest key first, see
    # shorten().
    _ABBREV: Dict[str, str] = {
        'ADDRESS': 'ADRS',
        'AGAIN': 'AGN',
        'ANTENNA': 'ANT',
        'BETTER': 'BTR',
        'CALLED': 'CLD',
        'CALLING': 'CLG',
        'CAN\'T': 'CNT',
        'COME': 'CUM',
        'CONDITION': 'CONDX',
        'CONDITIONS': 'CONDX',
        'YES': 'C',
        'CORRECT': 'C',
        'AFFIRMATIVE': 'C',
        'CONFIRM': 'CFM',
        'CHECK': 'CK',
        'SEE YOU LATER': 'CUL',
        'SEE YOU': 'CU',
        'DAY': 'DA',
        'DELIVERED': 'DLVD',
        'DIFFERENCE': 'DIFF',
        'DIFFERENT': 'DIFF',
        'DOWN': 'DN',
        'DEAR': 'DR',
        'FROM': 'DE',
        'LONG DISTANCE': 'DX',
        'LONG': 'LNG',
        'DISTANCE': 'DX',
        'DISTANT': 'DX',
        'FOREIGN': 'DX',
        'ELEMENT': 'EL',
        'FOR': 'FER',
        'FINE BUSINESS': 'FB',
        'EXCELLENT': 'FB',
        'WONDERFUL': 'FB',
        'GUESS': 'GESS',
        'GOOD AFTERNOON': 'GA',
        'GOOD EVENING': 'GE',
        'GOOD MORNING': 'GM',
        'GOOD NIGHT': 'GN',
        'GOODNIGHT': 'GN',
        'GOOD': 'GD',
        'GOING': 'GG',
        'GIVE': 'GV',
        'GIVING': 'GVG',
        'HERE': 'HR',
        'HEAR': 'HR',
        'HOPE': 'HP',
        'LEAVE': 'LV',
        'LEAVING': 'LVG',
        'MESSAGE': 'MSG',
        'MY NAME': 'MN',
        'NO MORE': 'NM',
        'NO': 'N',
        'NOTHING': 'NIL',
        'NOW': 'NW',
        'NUMBER': 'NR',
        'OKAY': 'OK',
        'OLD BOY': 'OB',
        'OLD CHAP': 'OC',
        'OLD MAN': 'OM',
        'OPERATOR': 'OP',
        'HUSBAND': 'OM',
        'PACKAGE': 'PKG',
        'PLEASE': 'PLS',
        'PAPER': 'PPR',
        'PREFIX': 'PX',
        'PRESS': 'PX',
        'POWER': 'PWR',
        'REFER TO': 'RFR',
        'REFERRING TO': 'RFR',
        'REGARDING': 'RE',
        'CONCERNING': 'RE',
        'I AM IN': 'QTH',
        'RECEIVED': 'R',
        'RECEIVE': 'RX',
        'RECEIVER': 'RX',
        'REPORT': 'RPT',
        'REPEAT': 'RPT',
        'SAID': 'SED',
        'SAYS': 'SEZ',
        'SIGNED': 'SGD',
        'SERVICE': 'SVC',
        'SO FAR': 'SFR',
        'SIGNATURE': 'SIG',
        'SIGNAL': 'SIG',
        'DEAD': 'SK',
        'DECEASED': 'SK',
        'GHOST': 'SK',
        'SORRY': 'SRI',
        'STATION': 'STN',
        'SOME': 'SUM',
        'THAT': 'TT',
        'THANK YOU': 'TU',
        'THANKS': 'TU',
        'TRANSMIT': 'TX',
        'TRANSMITTER': 'TX',
        'TRAFFIC': 'TFC',
        'TEXT': 'TXT',
        'TOMORROW': 'TMW',
        'TRICKS': 'TRIX',
        'THAT IS': 'TTS',
        'WORD': 'W',
        'WORDS': 'W',
        'WORD AFTER': 'WA',
        'WORD BEFORE': 'WB',
        'TRANSCEIVER': 'XCVR',
        'WELL': 'WL',
        'WILL': 'WL',
        'WITH': 'WID',
        'WIFE': 'XYL',
        'WORKED': 'WKD',
        'WORKING': 'WKG',
        'WOULD': 'WUD',
        'YOURS': 'URS',
        'WOMAN': 'YL',
        'GIRL': 'YL',
        'YEAR': 'YR',
        'YEARS': 'YRS',
        'YOU': 'U',
        'YOUR': 'UR',
        'YOU\'RE': 'UR',
        'LATER': 'LTR',
        'LETTER': 'LTR',
        'GOODBYE': '73',
        'FAREWELL': '73',
    }

    # Character -> replacement text.
    _MERGE: Dict[str, str] = {
        '[': '(',
        ']': ')',
        '{': '(',
        '}': ')',
        '_': ' ',
        '“': '"',
        '”': '"',
        '‘': '\'',
        '’': '\'',
        # Upper case: the corpus is upper-cased and only upper-case letters
        # are counted.
        '×': 'X',
        '*': '',
        '\\': '/',
        ';': ':',
        '—': '-',
        '<': '(',
        '>': ')',
        '$': '',
        '£': '',
        '%': '0/0',
        'Å': 'À',
        'Ą': 'Ä',
        'Æ': 'Ä',
        'Ĉ': 'Ć',
        'Ç': 'Ć',
        'Ĥ': 'Š',
        'Ę': 'É',
        'Ł': 'È',
        'Ñ': 'Ń',
        'Ö': 'Ó',
        'Ø': 'Ó',
    }

    text: str = ''
    freq: Dict[str, int] = {}

    def __init__(self) -> None:
        """Load the corpus (from cache or network) and count its characters.

        If a file named ``corpus`` exists it is read and shortened again (the
        cache already holds shortened text, so this normally changes
        nothing); otherwise the texts are downloaded, shortened and written
        to that file.
        """
        self.text = ''
        self.freq = {}
        if path.isfile('corpus'):
            self.read()
            self.shorten()
        else:
            self.download()
            self.shorten()
            self.write()
        self.count()

    def download(self):
        """Append the upper-cased text of four Gutenberg books to ``text``."""
        prefix: str = 'https://www.gutenberg.org/'
        urls: Dict[str, str] = {
            'Dracula': 'cache/epub/345/pg345.txt',
            'Frankenstein': 'cache/epub/84/pg84.txt',
            'Moby Dick': 'files/2701/2701-0.txt',
            'Alice in Wonderland': 'cache/epub/11/pg11.txt'
        }
        for title, location in urls.items():
            print('Downloading', title)
            # Close the HTTP response as soon as its body has been read.
            with urlopen(prefix + location) as response:
                self.text += response.read().decode('utf-8').upper()

    def shorten(self):
        """Normalise ``text``: merge characters, then abbreviate words.

        Characters are merged first so that typographic apostrophes and
        ``_italic_`` markers do not stop a word from matching.  Abbreviations
        are matched on word boundaries and applied longest key first, so a
        phrase such as ``THAT IS`` is replaced before its shorter prefix
        ``THAT`` can consume it.
        """
        print('Merging characters')
        for find, replace in self._MERGE.items():
            self.text = self.text.replace(find, replace)

        print('Abbreviating words')
        # sorted() is stable: keys of equal length keep their table order.
        longest_first = sorted(
            self._ABBREV.items(), key=lambda item: len(item[0]), reverse=True)
        for find, replace in longest_first:
            pattern = f'\\b{re.escape(find)}\\b'
            self.text = re.sub(pattern, replace, self.text)

    def write(self) -> None:
        """Save ``text`` to the ``corpus`` cache file as UTF-8."""
        with open('corpus', 'w', encoding='utf-8') as f:
            f.write(self.text)

    def read(self) -> None:
        """Load ``text`` from the UTF-8 ``corpus`` cache file."""
        with open('corpus', 'r', encoding='utf-8') as f:
            self.text = f.read()

    def count(self) -> None:
        """Count letters, digits and the accepted accented letters in ``text``.

        Stores the result in ``freq`` ordered by descending count; characters
        with equal counts stay in order of first appearance.
        """
        valid = set(ascii_uppercase + '1234567890' + 'ÀÄĆŠÉÈŃÓ')
        counts: Dict[str, int] = {}
        for c in self.text:
            if c in valid:
                counts[c] = counts.get(c, 0) + 1
        self.freq = dict(
            sorted(counts.items(), key=lambda item: item[1], reverse=True))
# Run the report only when this file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment