Skip to content

Instantly share code, notes, and snippets.

@joodicator
Created November 4, 2021 17:55
Show Gist options
  • Save joodicator/a414fe4bf92bf166457981d75d5f22a2 to your computer and use it in GitHub Desktop.
Save joodicator/a414fe4bf92bf166457981d75d5f22a2 to your computer and use it in GitHub Desktop.
estimate_scrabble_word_count.py
from itertools import combinations
from functools import reduce
from operator import mul
from math import factorial, perm
from collections import Counter
import re
# English letter frequencies from
# <https://en.wikipedia.org/wiki/Letter_frequency>.
eng_letter_freqs = {
'A': 0.078,
'B': 0.02,
'C': 0.04,
'D': 0.038,
'E': 0.11,
'F': 0.014,
'G': 0.03,
'H': 0.023,
'I': 0.086,
'J': 0.0021,
'K': 0.0097,
'L': 0.053,
'M': 0.027,
'N': 0.072,
'O': 0.061,
'P': 0.028,
'Q': 0.0019,
'R': 0.073,
'S': 0.087,
'T': 0.067,
'U': 0.033,
'V': 0.01,
'W': 0.0091,
'X': 0.0027,
'Y': 0.016,
'Z': 0.0044,
}
assert 0.99 < sum(eng_letter_freqs.values()) < 1.01, \
str(sum(eng_letter_freqs.values()))
# Valid SOWPODS words of each length, from
# <https://en.wikipedia.org/wiki/Collins_Scrabble_Words#Word_count>.
valid_words_of_length = {
2: 127,
3: 1347,
4: 5638,
5: 12972,
6: 23033,
7: 34342,
8: 42150,
}
CHAIN_RE = re.compile(r'(.)\1*')
def number_of_words(rack):
rack = sorted(rack)
# The number of times each letters occurs in the rack:
multiplicity = Counter(rack)
# Will be updated to contain the estimated answer:
count = 0.0
for word_length in range(2, 9):
# The number of ways to arrange `word_length` symbols, assuming all of the
# symbols are distinct and that order is significant:
num_permutations = factorial(word_length)
for letters in combinations(rack, word_length):
# #(W : W is a permutation of `letters`, W is a valid word)
# = P(W is a permutation of `letters` | W is a valid word)
# * #(W is a valid word)
# ~= P(L = letters[0]) * ... * P(L = letters[-1])
# * #(W is a permutation of `letters`)
# * #(W is a valid word)
# An estimate of the number of permutations of `letters` which are
# valid words:
partial_count = reduce(mul, (eng_letter_freqs[l] for l in letters)) \
* num_permutations * valid_words_of_length[word_length]
# Examine each (contiguous) set of repeated letters within `letters`:
i = 0
while i < len(letters):
letter = letters[i]
j = i + 1
while j < len(letters) and letters[j] == letter:
j += 1
# `c` is the number of times `letter` is repeated within `letters`, and
# `d` is the number of times `letter` is repeated within `rack`:
c, d, i = j-i, multiplicity[letter], j
if c == d == 1: continue
# Correct for the fact that due to this string of `c` letters:
# 1. the number of times this value of `letters` is encountered
# is `d!/(c! (d-c)!)` times too large; and
# 2. the value of `num_permutations` is `c!` times too large;
# and hence: `partial_count` is `d!/(d-c)!` times too large:
partial_count /= perm(d, c)
count += partial_count
return count
if __name__ == '__main__':
import sys
for arg in sys.argv[1:]:
arg = arg.upper()
print(f'{arg}\t{number_of_words(arg)}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment