Skip to content

Instantly share code, notes, and snippets.

@idanarye
Last active March 14, 2021 18:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save idanarye/fb75e5f813ddbff7d664204607c20321 to your computer and use it in GitHub Desktop.
Save idanarye/fb75e5f813ddbff7d664204607c20321 to your computer and use it in GitHub Desktop.
#!/usr/bin/python3
import sys
import math
import requests
# Estimate how many bits of entropy a letter of a first name carries, based on
# a name-frequency file fetched from the URL given as the first command-line
# argument, then report how long the rarest and the most frequent names would
# have to be if every letter carried that much information.

# Seconds to wait for the server before giving up; without a timeout
# `requests.get` may block indefinitely on an unresponsive host.
REQUEST_TIMEOUT_SECONDS = 30


def _die(message):
    """Print *message* to stderr and terminate with exit status 1."""
    print(message, file=sys.stderr)
    sys.exit(1)


def parse_names_occurences(data):
    """Parse the text of a name-frequency file into a ``{name: count}`` dict.

    Every line up to and including the first empty line is treated as a
    header and discarded.  Each remaining line must look like ``name,count``;
    blank (or whitespace-only) lines in that section are skipped instead of
    aborting the parse.  If a name appears more than once, the last count
    wins.

    Raises:
        ValueError: if a non-blank data line has no comma or its count is
            not an integer.
    """
    lines = iter(data.splitlines())

    # Consume the header: everything up to the first empty line.
    for line in lines:
        if not line:
            break

    names_occurences = {}
    for line in lines:
        if not line.strip():
            # e.g. a trailing blank line at the end of the file
            continue
        name, occurences = line.split(',', 1)
        names_occurences[name] = int(occurences)
    return names_occurences


def compute_names_probabilities(names_occurences):
    """Return ``{name: count / total_count}`` for the given occurrence counts."""
    total_occurences = sum(names_occurences.values())
    return {
        name: occurences / total_occurences
        for name, occurences in names_occurences.items()
    }


def compute_entropy_per_letter(names_probabilities):
    """Return the entropy of the name distribution divided by the mean name length.

    The entropy is ``sum(-p * log2(p))`` over all names, in bits.  All
    probabilities must be strictly positive, and at least one name must be
    non-empty.
    """
    total_entropy = sum(-p * math.log2(p) for p in names_probabilities.values())
    # NOTE(review): this is the plain mean over distinct names, not weighted
    # by how common each name is - kept as in the original; confirm that this
    # is the intended estimate.
    average_name_length = (
        sum(len(name) for name in names_probabilities) / len(names_probabilities)
    )
    return total_entropy / average_name_length


def bits_for_probability(probability):
    """Return the information content, in bits, of an event with *probability*."""
    return math.log2(1 / probability)


def letters_needed(bits, entropy_per_letter):
    """Return how many letters are needed to carry *bits* of information."""
    return math.ceil(bits / entropy_per_letter)


def main(argv=None):
    """Fetch the name-frequency file named on the command line and print the report.

    *argv* defaults to ``sys.argv``; ``argv[1]`` is the URL to download.
    Exits with status 1 when the URL is missing or the data is unusable, and
    lets ``requests`` exceptions propagate on network or HTTP errors.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) <= 1:
        _die('Please supply, as argument, the URL to one of the files from https://www.galbithink.org/names/us200.htm')

    r = requests.get(argv[1], timeout=REQUEST_TIMEOUT_SECONDS)
    r.raise_for_status()

    # Names that never occur are outside the distribution: log2(0) is
    # undefined and their "1:N" odds would divide by zero.
    names_occurences = {
        name: occurences
        for name, occurences in parse_names_occurences(r.text).items()
        if occurences > 0
    }
    # With fewer than two names the entropy is zero (or undefined) and every
    # "letters long" figure below would divide by zero.
    if len(names_occurences) < 2:
        _die('Need at least two names with a positive number of occurences to estimate entropy')

    names_probabilities = compute_names_probabilities(names_occurences)
    entropy_per_letter = compute_entropy_per_letter(names_probabilities)
    print(f'Entropy per letter: {entropy_per_letter}')
    print()

    rarest_name_occurences = min(names_occurences.values())
    rarest_names = [
        name
        for name, occurences in names_occurences.items()
        if occurences == rarest_name_occurences
    ]
    rarest_name_probability = min(names_probabilities.values())
    print(f'Any of the {len(rarest_names)} rarest name are 1:{1 / rarest_name_probability}')
    bits_for_rarest_name = bits_for_probability(rarest_name_probability)
    print(f'Bits for rarest name: {bits_for_rarest_name}')
    print(f'Rarest name needs to be {letters_needed(bits_for_rarest_name, entropy_per_letter)} letters long')
    rarest_names_lengths = [len(name) for name in rarest_names]
    print('Rarest names are between', min(rarest_names_lengths), 'and', max(rarest_names_lengths), 'letters long')

    # The five most common names, most frequent first.
    most_frequent_names = sorted(
        names_occurences, key=names_occurences.__getitem__, reverse=True
    )[:5]
    for i, frequent_name in enumerate(most_frequent_names, start=1):
        print()
        print(f'#{i} Most frequent name is {frequent_name}, which is {len(frequent_name)} letters long')
        bits_for_frequent_name = bits_for_probability(names_probabilities[frequent_name])
        print(f'{frequent_name} is worth {bits_for_frequent_name} bits')
        print(f'{frequent_name} would needs to be', letters_needed(bits_for_frequent_name, entropy_per_letter), 'letters long')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment