Skip to content

Instantly share code, notes, and snippets.

@rtpg
Created June 22, 2020 14:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rtpg/8dabf0ceb0fc807ae80510a7989ef709 to your computer and use it in GitHub Desktop.
Save rtpg/8dabf0ceb0fc807ae80510a7989ef709 to your computer and use it in GitHub Desktop.
Find the biggest set of anagrams!
"""
An implementation of the Fermat's library anagram detection
https://twitter.com/fermatslibrary/status/1275066521450975234
Takes a list of line seperated words and uses prime factors to
encode letters and identify the equivalency classes.
Prints out the top 10 anagram classes for the provided word list
"""
from collections import defaultdict, Counter
import sys
primes = [
2,
3,
5,
7,
11,
13,
17,
19,
23,
29,
31,
37,
41,
43,
47,
53,
59,
61,
67,
71,
73,
79,
83,
89,
97,
101,
]
letters = "abcdefghijklmnopqrstuvwxyz"
letters_to_numbers = {
letters[i]: primes[i] for i in range(26)
}
numbers_to_letters = {
primes[i]: letters[i] for i in range(26)
}
def score(word):
result = 1
for c in word:
c = c.lower() # lowercase
# ignore unrecognized symbols
if c not in letters_to_numbers:
continue
result *= letters_to_numbers[c]
return result
# there were ~100k words in the dictionary
# counting the occurences
c = Counter()
# actually storing the results
full_dict = defaultdict(list)
def disqualified(line):
# remove lines that are boring
# 's? really?
if line.strip()[-2:] == "'s":
return True
# proper nouns, no thank you
if line[0].lower() != line[0]:
return True
# no n' and t' (or s') (or l'), for french
if line.strip()[:2] in {"n'", "t'", "l'", "s'"}:
return True
return False
# a way to generate a word list through aspell (here for french dictionary)
# aspell -d fr dump master | aspell -l fr expand > fr.dict
with open(sys.argv[1], 'r') as word_list:
# keep count to know the speed
for line in word_list:
if disqualified(line):
continue
s = score(line)
c[s] += 1
full_dict[s].append(line)
c.most_common(10)
def factors(n):
# actually give letters, not the numbers
results = defaultdict(int)
current_factor = 2
while n > 1:
if n % current_factor == 0:
n /= current_factor
results[current_factor] += 1
else:
current_factor += 1
# map to the letters
return {
numbers_to_letters[k]: v for k, v in results.items()
}
# show 10 most common with the letters involved
for factor_class, result_count in c.most_common(10):
components = factors(factor_class)
components_pretty = ""
for letter, count in components.items():
components_pretty += letter.upper() * count
print(components_pretty, " has ", result_count, " results")
for w in full_dict[factor_class]:
print(f" - {w.strip()}")
print("================")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment