Skip to content

Instantly share code, notes, and snippets.

@serif
Last active July 25, 2023 21:21
Show Gist options
  • Save serif/68fa1b389e90072d9c1b377de123a92d to your computer and use it in GitHub Desktop.
Save serif/68fa1b389e90072d9c1b377de123a92d to your computer and use it in GitHub Desktop.
Morse code efficiency vs 5-bit ASCII
#!/usr/bin/env python3
import re
from typing import Dict, List
from os import path
from urllib.request import urlopen
from string import ascii_uppercase
# from itertools import product
# from collections import Counter
def main():
    """Print a Morse-versus-fixed-width-binary comparison report.

    Builds the standard Morse translator and the text corpus, prints the
    lookup tables and per-letter rows, then measures a sample sentence and
    the corpus letter frequencies under three Morse tables: the standard
    one, one generated from the Generator's default letter order, and one
    generated from the corpus' own frequency order.
    """
    # The name `t` is kept because the f-strings below print it verbatim.
    t: Translator = Translator()
    print('\nGenerating corpus data')
    corpus: Corpus = Corpus()

    # Show dict contents
    print(f'{t.d_morse=}')
    print(f'{t.d_morse_bin=}')
    print()

    # One table for the most common letters, one for the least common.
    for letters in ('ETAOINSHR', 'JXQZ'):
        t.header()
        for letter in letters:
            t.test(letter, True)
        print()

    # Text test 1
    t.test('HELLO WORLD.')

    # Text test 2, Article 1 of the Universal Declaration of Human Rights
    article: str = (
        'All human beings are born free and equal in dignity and rights. '
        'They are endowed with reason and conscience '
        'and should act towards one another in a spirit of brotherhood.'
    ).upper()
    t.test(article)
    t.fast_test(corpus.freq)

    # Bit width comparison
    t.test_widths(article)

    def rerun(banner: str, **generator_kwargs) -> None:
        # Swap in a freshly generated Morse table and repeat the measurements.
        # The Generator is built only after the banner is printed, so the two
        # generated tables are created strictly one after the other.
        print(banner)
        t.d_morse = Generator(**generator_kwargs).morse
        t.gen_morse_bin()
        t.test(article)
        t.fast_test(corpus.freq)

    # Generate naively optimized Morse
    rerun('\n-- Optimized by Wikipedia\'s letter frequency\n')

    # Generate better optimized Morse
    rerun(
        '\n-- Optimized by frequency of corpus with CW abbreviations\n',
        etaoin=''.join(corpus.freq),
    )
class Translator:
    """Translate text to Morse, to a Morse bit stream, and to fixed-width binary.

    `d_morse` maps a character to its dot/dash string; `d_morse_bin` maps the
    same character to the bit string derived from it by `gen_morse_bin`.
    Callers may replace `d_morse` and call `gen_morse_bin()` again to measure
    an alternative code table.
    """

    d_morse: Dict[str, str]
    d_morse_bin: Dict[str, str]

    def __init__(self):
        # Dict: Morse
        self.d_morse = {
            'A': '.–',
            'B': '–...',
            'C': '–.–.',
            'D': '–..',
            'E': '.',
            'F': '..–.',
            'G': '––.',
            'H': '....',
            'I': '..',
            'J': '.–––',
            'K': '–.–',
            'L': '.–..',
            'M': '––',
            'N': '–.',
            'O': '–––',
            'P': '.––.',
            'Q': '––.–',
            'R': '.–.',
            'S': '...',
            'T': '–',
            'U': '..–',
            'V': '...–',
            'W': '.––',
            'X': '–..–',
            'Y': '–.––',
            'Z': '––..',
            '1': '.––––',
            '2': '..–––',
            '3': '...––',
            '4': '....–',
            '5': '.....',
            '6': '–....',
            '7': '––...',
            '8': '–––..',
            '9': '––––.',
            '0': '–––––',
            '.': '.–.–.–',
            ',': '––..––',
            '?': '..––..',
            '\'': '.––––.',
            '!': '–.–.––',
            '/': '–..–.',
            '(': '–.––.',
            ')': '–.––.–',
            '&': '.–...',
            ':': '–––...',
            ';': '–.–.–.',
            '=': '–...–',
            '+': '.–.–.',
            '-': '–....–',
            '_': '..––.–',
            '"': '.–..–.',
            '$': '...–..–',
            '@': '.––.–.',
            'À': '.––.–',
            'Ä': '.–.–',
            'Ć': '–.–..',
            'Š': '––––',
            'Ð': '..––.',
            'È': '.–..–',
            'É': '.–..–',
            'Ĝ': '––.–.',
            'Ĵ': '.–––.',
            'Ñ': '––.––',
            'Ó': '–––.',
            'Ś': '...–...',
            'Ŝ': '...–.',
            'Þ': '.––..',
            'Ü': '..––',
            'Ź': '––..–.',
            'Ż': '––..–',
            ' ': ' ',
        }
        self.gen_morse_bin()

    def gen_morse_bin(self):
        """Rebuild `d_morse_bin` from the current `d_morse`.

        Each dot becomes '10', each dash '1110' and a space symbol '00';
        every code is then terminated with a further '00'. Exits the program
        if a code contains any other symbol.
        """
        symbol_bits: Dict[str, str] = {' ': '00', '.': '10', '–': '1110'}
        self.d_morse_bin = {}
        for char, code in self.d_morse.items():
            chunks: List[str] = []
            for symbol in code:
                if symbol not in symbol_bits:
                    exit(f'unexpected char "{symbol}" in d_morse["{char}"]')
                chunks.append(symbol_bits[symbol])
            # Trailing '00' is appended once per character.
            self.d_morse_bin[char] = ''.join(chunks) + '00'

    def to_morse(self, abc: str, space: bool = True) -> str:
        """Return the dot/dash rendering of `abc`.

        Characters that have no entry in `d_morse` are skipped. When `space`
        is true a single ' ' follows every emitted code (including the last).
        """
        pieces: List[str] = []
        for c in abc:
            # Look the *character* up; the previous code tested the output
            # accumulated so far and therefore never emitted the right codes.
            code = self.d_morse.get(c)
            if code is None:
                continue
            pieces.append(code + ' ' if space else code)
        return ''.join(pieces)

    def to_morse_bin(self, abc: str, space: bool = True) -> str:
        """Return the Morse bit stream for `abc`.

        When `space` is true a ' ' follows every character's bits.

        Raises:
            KeyError: if a character has no entry in `d_morse_bin`.
        """
        separator = ' ' if space else ''
        return ''.join(self.d_morse_bin[c] + separator for c in abc)

    @staticmethod
    def to_bin(abc: str, bits: int = 5, space: bool = True) -> str:
        """Return `abc` encoded as fixed-width binary, `bits` bits per character.

        ' ' encodes as all zeros and '.' as the value 1; every other
        character contributes the low `bits` bits of its code point, so each
        character is always exactly `bits` wide. When `space` is true a ' '
        follows every character. Exits the program unless 1 <= bits <= 8.
        """
        if not 0 < bits < 9:
            exit(f'invalid bit width: {bits}')
        mask = (1 << bits) - 1
        separator = ' ' if space else ''
        pieces: List[str] = []
        for c in abc:
            if c == ' ':
                value = 0
            elif c == '.':
                value = 1
            else:
                # Masking keeps the width fixed even for code points > 255.
                value = ord(c) & mask
            pieces.append(format(value, f'0{bits}b') + separator)
        return ''.join(pieces)

    @staticmethod
    def header() -> None:
        """Print the Markdown table header used by one-line `test` rows."""
        print('Letter | Morse | Morse binary | Morse duration | 5-ASCII binary')
        print('---|---|----|----|----')

    def test(self, text: str, one_line: bool = False) -> None:
        """Print `text` in Morse, Morse binary and 5-bit binary with bit counts.

        With `one_line` true the output is a single ' | '-separated table row;
        otherwise it is a short Markdown bullet list.
        """
        morse_str: str = self.to_morse(text)
        morse_bin_str: str = self.to_morse_bin(text)
        morse_bin_len: str = self.bit_count(morse_bin_str)
        ascii_str: str = self.to_bin(text)
        ascii_str_len: str = self.bit_count(ascii_str)
        if one_line:
            line: str = ' | '.join(
                [text, morse_str, morse_bin_str, morse_bin_len, ascii_str])
        else:
            line = f'"{text}"\n\n* **Morse:** {morse_str}\n* **Morse binary, '
            line += f'{morse_bin_len} bits:** {morse_bin_str}\n'
            line += f'* **ASCII, {ascii_str_len} bits:** {ascii_str}\n'
        print(line)

    def fast_test(self, freq: Dict[str, int]) -> None:
        """Print the total Morse bit count for a character-frequency table.

        Each key's bit count is multiplied by its frequency; keys that are
        empty or whitespace-only are ignored.
        """
        count: int = 0
        for k, v in freq.items():
            if k.strip() == '':
                continue
            count += int(self.bit_count(self.to_morse_bin(k))) * v
        print(f'Library test, {count:,} bits')

    def test_widths(self, text: str) -> None:
        """Print the bit count of `text` at 5, 6 and 8 bits per char and in Morse."""
        out: str = ''
        for width in [5, 6, 8]:
            ascii_str = self.to_bin(text, bits=width)
            length = self.bit_count(ascii_str)
            out += f'\n* {width}-ASCII: {length} bits'
        morse_bin_str: str = self.to_morse_bin(text)
        morse_bin_len: str = self.bit_count(morse_bin_str)
        out += f'\n* Morse  : {morse_bin_len} bits'
        print(out)

    @staticmethod
    def bit_count(stream) -> str:
        """Return, as a string, the length of `stream` with spaces removed."""
        return str(len(stream.replace(' ', '')))
class Generator:
    """Build a Morse-like code table that gives the shortest codes to the
    characters listed first in `etaoin`.

    NOTE(review): `slots` and `morse` are class-level dicts that the methods
    below mutate through `self`, so entries written by one instance remain
    visible to later instances (only the sorted `slots` becomes a
    per-instance attribute).
    """

    slots: Dict[str, int] = {}
    morse: Dict[str, str] = {}
    etaoin: str = ''

    def __init__(self, etaoin: str = 'ETAOINSHRDLCUMWFGYPBVKJXQZ1234567890.,?!/ÀÄĆŠÉÈŃÓ'):
        """Store the priority order and immediately build the code table."""
        self.etaoin = etaoin
        self.create_sorted_slots()
        self.fill_slots()

    def create_sorted_slots(self) -> None:
        """Enumerate every dot/dash code of length 1-5, ordered by cost.

        A dot costs 21 and a dash costs 40. Codes are inserted shortest
        first, dots before dashes, and then stably sorted by cost, so
        equal-cost codes keep that insertion order.
        """
        codes: List[str] = ['']
        for _ in range(5):
            # Extend every code of the previous length by one symbol.
            codes = [prefix + symbol for prefix in codes for symbol in '.–']
            for code in codes:
                self.slots[code] = 21 * code.count('.') + 40 * code.count('–')
        ranked = sorted(self.slots.items(), key=lambda item: item[1])
        self.slots = dict(ranked)

    def fill_slots(self):
        """Assign the i-th cheapest code to the i-th character of `etaoin`.

        The space character always maps to itself. Raises IndexError when
        `etaoin` has more characters than there are codes.
        """
        ordered: List[str] = list(self.slots)
        for position, char in enumerate(self.etaoin):
            self.morse[char] = ordered[position]
        self.morse[' '] = ' '
class Corpus:
    """Upper-cased sample text plus the frequency of each countable character.

    On construction the text is loaded from the local file 'corpus' when it
    exists, otherwise downloaded and cached there. Either way it is passed
    through `shorten()` and then counted into `freq`.
    """

    text: str = ''
    freq: Dict[str, int] = {}

    def __init__(self) -> None:
        cached: bool = path.isfile('corpus')
        if cached:
            self.read()
        else:
            self.download()
        self.shorten()
        if not cached:
            # Cache the already-shortened text for the next run.
            self.write()
        self.count()

    def download(self):
        """Fetch the source books and append their upper-cased text to `text`."""
        prefix: str = 'https://www.gutenberg.org/'
        urls: Dict[str, str] = {
            'Dracula': 'cache/epub/345/pg345.txt',
            'Frankenstein': 'cache/epub/84/pg84.txt',
            'Moby Dick': 'files/2701/2701-0.txt',
            'Alice in Wonderland': 'cache/epub/11/pg11.txt'
        }
        for k, v in urls.items():
            print('Downloading', k)
            # Close each HTTP response as soon as it has been read.
            with urlopen(prefix + v) as response:
                self.text += response.read().decode('utf-8').upper()

    def shorten(self):
        """Replace whole words/phrases by CW abbreviations, then merge characters.

        Abbreviations are applied longest phrase first so that a multi-word
        entry such as 'THAT IS' is not pre-empted by the shorter 'THAT'.
        Afterwards single characters are replaced according to `merge`.
        """
        abbrev: Dict[str, str] = {
            'ADDRESS': 'ADRS',
            'AGAIN': 'AGN',
            'ANTENNA': 'ANT',
            'BETTER': 'BTR',
            'CALLED': 'CLD',
            'CALLING': 'CLG',
            'CAN\'T': 'CNT',
            'COME': 'CUM',
            'CONDITION': 'CONDX',
            'CONDITIONS': 'CONDX',
            'YES': 'C',
            'CORRECT': 'C',
            'AFFIRMATIVE': 'C',
            'CONFIRM': 'CFM',
            'CHECK': 'CK',
            'SEE YOU LATER': 'CUL',
            'SEE YOU': 'CU',
            'DAY': 'DA',
            'DELIVERED': 'DLVD',
            'DIFFERENCE': 'DIFF',
            'DIFFERENT': 'DIFF',
            'DOWN': 'DN',
            'DEAR': 'DR',
            'FROM': 'DE',
            'LONG DISTANCE': 'DX',
            'LONG': 'LNG',
            'DISTANCE': 'DX',
            'DISTANT': 'DX',
            'FOREIGN': 'DX',
            'ELEMENT': 'EL',
            'FOR': 'FER',
            'FINE BUSINESS': 'FB',
            'EXCELLENT': 'FB',
            'WONDERFUL': 'FB',
            'GUESS': 'GESS',
            'GOOD AFTERNOON': 'GA',
            'GOOD EVENING': 'GE',
            'GOOD MORNING': 'GM',
            'GOOD NIGHT': 'GN',
            'GOODNIGHT': 'GN',
            'GOOD': 'GD',
            'GOING': 'GG',
            'GIVE': 'GV',
            'GIVING': 'GVG',
            'HERE': 'HR',
            'HEAR': 'HR',
            'HOPE': 'HP',
            'LEAVE': 'LV',
            'LEAVING': 'LVG',
            'MESSAGE': 'MSG',
            'MY NAME': 'MN',
            'NO MORE': 'NM',
            'NO': 'N',
            'NOTHING': 'NIL',
            'NOW': 'NW',
            'NUMBER': 'NR',
            'OKAY': 'OK',
            'OLD BOY': 'OB',
            'OLD CHAP': 'OC',
            'OLD MAN': 'OM',
            'OPERATOR': 'OP',
            'HUSBAND': 'OM',
            'PACKAGE': 'PKG',
            'PLEASE': 'PLS',
            'PAPER': 'PPR',
            'PREFIX': 'PX',
            'PRESS': 'PX',
            'POWER': 'PWR',
            'REFER TO': 'RFR',
            'REFERRING TO': 'RFR',
            'REGARDING': 'RE',
            'CONCERNING': 'RE',
            'I AM IN': 'QTH',
            'RECEIVED': 'R',
            'RECEIVE': 'RX',
            'RECEIVER': 'RX',
            'REPORT': 'RPT',
            'REPEAT': 'RPT',
            'SAID': 'SED',
            'SAYS': 'SEZ',
            'SIGNED': 'SGD',
            'SERVICE': 'SVC',
            'SO FAR': 'SFR',
            'SIGNATURE': 'SIG',
            'SIGNAL': 'SIG',
            'DEAD': 'SK',
            'DECEASED': 'SK',
            'GHOST': 'SK',
            'SORRY': 'SRI',
            'STATION': 'STN',
            'SOME': 'SUM',
            'THAT': 'TT',
            'THANK YOU': 'TU',
            'THANKS': 'TU',
            'TRANSMIT': 'TX',
            'TRANSMITTER': 'TX',
            'TRAFFIC': 'TFC',
            'TEXT': 'TXT',
            'TOMORROW': 'TMW',
            'TRICKS': 'TRIX',
            'THAT IS': 'TTS',
            'WORD': 'W',
            'WORDS': 'W',
            'WORD AFTER': 'WA',
            'WORD BEFORE': 'WB',
            'TRANSCEIVER': 'XCVR',
            'WELL': 'WL',
            'WILL': 'WL',
            'WITH': 'WID',
            'WIFE': 'XYL',
            'WORKED': 'WKD',
            'WORKING': 'WKG',
            'WOULD': 'WUD',
            'YOURS': 'URS',
            'WOMAN': 'YL',
            'GIRL': 'YL',
            'YEAR': 'YR',
            'YEARS': 'YRS',
            'YOU': 'U',
            'YOUR': 'UR',
            'YOU\'RE': 'UR',
            'LATER': 'LTR',
            'LETTER': 'LTR',
            'GOODBYE': '73',
            'FAREWELL': '73',
        }
        merge: Dict[str, str] = {
            '[': '(',
            ']': ')',
            '{': '(',
            '}': ')',
            '_': ' ',
            '“': '"',
            '”': '"',
            '‘': '\'',
            '’': '\'',
            '×': 'x',
            '*': '',
            '\\': '/',
            ';': ':',
            '—': '-',
            '<': '(',
            '>': ')',
            '$': '',
            '£': '',
            '%': '0/0',
            'Å': 'À',
            'Ą': 'Ä',
            'Æ': 'Ä',
            'Ĉ': 'Ć',
            'Ç': 'Ć',
            'Ĥ': 'Š',
            'Ę': 'É',
            'Ł': 'È',
            'Ñ': 'Ń',
            'Ö': 'Ó',
            'Ø': 'Ó',
        }
        print('Abbreviating words')
        # Longest phrase first: in dict order 'THAT', 'WORD' and 'YOU' were
        # rewritten before 'THAT IS', 'WORD AFTER', 'WORD BEFORE' and
        # "YOU'RE" were tried, so those longer entries could never match.
        # NOTE(review): abbreviations still run before the character merge,
        # so typographic apostrophes are not yet normalised at this point.
        for phrase in sorted(abbrev, key=len, reverse=True):
            pattern = rf'\b{re.escape(phrase)}\b'
            self.text = re.sub(pattern, abbrev[phrase], self.text)
        print('Merging characters')
        for old, new in merge.items():
            self.text = self.text.replace(old, new)

    def write(self) -> None:
        """Save `text` to the cache file 'corpus' as UTF-8."""
        # Explicit encoding: the text contains non-ASCII characters and the
        # platform default encoding is not guaranteed to represent them.
        with open('corpus', 'w', encoding='utf-8') as f:
            f.write(self.text)

    def read(self) -> None:
        """Load `text` from the cache file 'corpus' (UTF-8)."""
        with open('corpus', 'r', encoding='utf-8') as f:
            self.text = f.read()

    def count(self) -> None:
        """Count letters, digits and the supported accented letters in `text`.

        The result is stored in `freq`, ordered from most to least frequent;
        characters with equal counts keep their order of first appearance.
        """
        valid = set(ascii_uppercase + '1234567890' + 'ÀÄĆŠÉÈŃÓ')
        counts: Dict[str, int] = {}
        for c in self.text:
            if c in valid:
                counts[c] = counts.get(c, 0) + 1
        self.freq = dict(
            sorted(counts.items(), key=lambda x: x[1], reverse=True))
# Run the comparison only when this file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment