rtpg/anagram_classes.py

## anagram_classes.py
"""
An implementation of the Fermat's library anagram detection
https://twitter.com/fermatslibrary/status/1275066521450975234

Takes a list of line seperated words and uses prime factors to
encode letters and identify the equivalency classes.

Prints out the top 10 anagram classes for the provided word list
"""
from collections import defaultdict, Counter
import sys

primes = [
    2,
    3,
    5,
    7,
    11,
    13,
    17,
    19,
    23,
    29,
    31,
    37,
    41,
    43,
    47,
    53,
    59,
    61,
    67,
    71,
    73,
    79,
    83,
    89,
    97,
    101,
]

letters = "abcdefghijklmnopqrstuvwxyz"

letters_to_numbers = {
    letters[i]: primes[i] for i in range(26)
}


numbers_to_letters = {
    primes[i]: letters[i] for i in range(26)
}


def score(word):
    result = 1
    for c in word:
        c = c.lower() # lowercase
        # ignore unrecognized symbols
        if c not in letters_to_numbers:
            continue
        result *= letters_to_numbers[c]
    return result

# there were ~100k words in the dictionary
# counting the occurences
c = Counter()
# actually storing the results
full_dict = defaultdict(list)

def disqualified(line):
    # remove lines that are boring
    # 's? really?
    if line.strip()[-2:] == "'s":
        return True
    # proper nouns, no thank you
    if line[0].lower() != line[0]:
        return True
    # no n' and t' (or s') (or l'), for french
    if line.strip()[:2] in {"n'", "t'", "l'", "s'"}:
        return True
    return False

# a way to generate a word list through aspell (here for french dictionary)
# aspell -d fr dump master | aspell -l fr expand > fr.dict
with open(sys.argv[1], 'r') as word_list:
    # keep count to know the speed
    for line in word_list:
        if disqualified(line):
            continue
        s = score(line)
        c[s] += 1
        full_dict[s].append(line)

c.most_common(10)


def factors(n):
    # actually give letters, not the numbers
    results = defaultdict(int)
    current_factor = 2
    while n > 1:
        if n % current_factor == 0:
            n /= current_factor
            results[current_factor] += 1
        else:
            current_factor += 1
    # map to the letters
    return {
        numbers_to_letters[k]: v for k, v in results.items()
    }

# show 10 most common with the letters involved
for factor_class, result_count in c.most_common(10):
    components = factors(factor_class)
    components_pretty = ""
    for letter, count in components.items():
        components_pretty += letter.upper() * count
    print(components_pretty, " has ", result_count, " results")
    for w in full_dict[factor_class]:
        print(f"  - {w.strip()}")
    print("================")
	"""
	An implementation of the Fermat's library anagram detection
	https://twitter.com/fermatslibrary/status/1275066521450975234

	Takes a list of line seperated words and uses prime factors to
	encode letters and identify the equivalency classes.

	Prints out the top 10 anagram classes for the provided word list
	"""
	from collections import defaultdict, Counter
	import sys

	primes = [
	2,
	3,
	5,
	7,
	11,
	13,
	17,
	19,
	23,
	29,
	31,
	37,
	41,
	43,
	47,
	53,
	59,
	61,
	67,
	71,
	73,
	79,
	83,
	89,
	97,
	101,
	]

	letters = "abcdefghijklmnopqrstuvwxyz"

	letters_to_numbers = {
	letters[i]: primes[i] for i in range(26)
	}


	numbers_to_letters = {
	primes[i]: letters[i] for i in range(26)
	}


	def score(word):
	result = 1
	for c in word:
	c = c.lower() # lowercase
	# ignore unrecognized symbols
	if c not in letters_to_numbers:
	continue
	result *= letters_to_numbers[c]
	return result

	# there were ~100k words in the dictionary
	# counting the occurences
	c = Counter()
	# actually storing the results
	full_dict = defaultdict(list)

	def disqualified(line):
	# remove lines that are boring
	# 's? really?
	if line.strip()[-2:] == "'s":
	return True
	# proper nouns, no thank you
	if line[0].lower() != line[0]:
	return True
	# no n' and t' (or s') (or l'), for french
	if line.strip()[:2] in {"n'", "t'", "l'", "s'"}:
	return True
	return False

	# a way to generate a word list through aspell (here for french dictionary)
	# aspell -d fr dump master \| aspell -l fr expand > fr.dict
	with open(sys.argv[1], 'r') as word_list:
	# keep count to know the speed
	for line in word_list:
	if disqualified(line):
	continue
	s = score(line)
	c[s] += 1
	full_dict[s].append(line)

	c.most_common(10)


	def factors(n):
	# actually give letters, not the numbers
	results = defaultdict(int)
	current_factor = 2
	while n > 1:
	if n % current_factor == 0:
	n /= current_factor
	results[current_factor] += 1
	else:
	current_factor += 1
	# map to the letters
	return {
	numbers_to_letters[k]: v for k, v in results.items()
	}

	# show 10 most common with the letters involved
	for factor_class, result_count in c.most_common(10):
	components = factors(factor_class)
	components_pretty = ""
	for letter, count in components.items():
	components_pretty += letter.upper() * count
	print(components_pretty, " has ", result_count, " results")
	for w in full_dict[factor_class]:
	print(f" - {w.strip()}")
	print("================")