Created
June 2, 2021 01:01
-
-
Save tylerneylon/0eca66eb0b4a4165f2e72530bdd1cc03 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" some_random_words.py | |
Usage: | |
./some_random_words.py [letters_start_w_key_letter] | |
Prints out a bunch of random words that use these letters, | |
and which include the key letter. | |
""" | |
import random | |
import sys | |
from collections import Counter | |
# When True, load_bigram_model() trains only on dictionary words that are
# spelled entirely from the given letters and that contain the key (first)
# letter; when False, every dictionary word is used.
CHEAT_MODE = True
# Word-length setting; not referenced by any of the functions in this file.
WORD_LEN = 20
def load_bigram_model(letters, words_path='/usr/share/dict/words'):
    """Build an add-one-smoothed letter-bigram model from a word list.

    Args:
        letters: The allowed letters; `letters[0]` is the key letter. Only
            consulted when the module-level CHEAT_MODE flag is true, in
            which case the training words are restricted to those spelled
            entirely from `letters` and containing the key letter.
        words_path: Path of a text file holding one word per line.

    Returns:
        A dict mapping each character of 'a'..'z' plus '_' to a list of
        `(next_char, probability)` pairs over that same 27-character
        alphabet. '_' stands for a word boundary (start or end of word).
    """
    with open(words_path) as f:
        words = [line.strip().lower() for line in f]

    if CHEAT_MODE:
        # Build the membership set once instead of scanning `letters` for
        # every character of every dictionary word.
        allowed = set(letters)
        words = [
            word
            for word in words
            if allowed.issuperset(word) and letters[0] in word
        ]

    bigram_counter = Counter()
    print('Analyzing all words .. ', end='', flush=True)
    for word in words:
        # Pad with '_' so that word-start and word-end transitions are
        # counted just like letter-to-letter transitions.
        word_ = f'_{word}_'
        for a, b in zip(word_[:-1], word_[1:]):
            bigram_counter[a + b] += 1

    alpha = 'abcdefghijklmnopqrstuvwxyz_'

    # For each letter L, next_let_hist[L] will be a list of the form:
    #   [(M, prob_of_M_given_L), ...]
    next_let_hist = {}
    for let in alpha:
        # The +1 gives every transition a non-zero probability, even ones
        # never seen in the word list.
        counts = [(let2, bigram_counter[let + let2] + 1) for let2 in alpha]
        total = sum(count for _, count in counts)
        next_let_hist[let] = [(let2, count / total) for let2, count in counts]

    print('done!')
    return next_let_hist
def get_random_next_letter(this_letter, next_set):
    """Sample one letter from a discrete distribution.

    Args:
        this_letter: The current letter. Unused; kept so existing callers
            keep working.
        next_set: A non-empty list of `(letter, probability)` pairs whose
            probabilities are expected to sum to 1.

    Returns:
        The letter of the first pair at which the running probability total
        reaches a uniform random draw from [0, 1).

    Raises:
        IndexError: If `next_set` is empty.
    """
    u = random.random()
    total = 0.0
    for letter, prob in next_set:
        total += prob
        if total >= u:
            return letter
    # Floating-point rounding can leave the cumulative total slightly below
    # the draw; fall back to the last entry instead of indexing past the end.
    return next_set[-1][0]
def filter_to_ok_letters(ok_letters, next_set):
    """Restrict a letter distribution to `ok_letters` and renormalize it.

    `next_set` is a list of `(letter, probability)` pairs. The pairs whose
    letter appears in `ok_letters` are kept, in their original order, with
    each probability divided by the sum of the kept probabilities. When no
    pair survives the filter the result is an empty list.
    """
    kept = []
    mass = 0
    for letter, prob in next_set:
        if letter in ok_letters:
            kept.append((letter, prob))
            mass += prob
    return [(letter, prob / mass) for letter, prob in kept]
def generate_random_word(ok_letters, next_let_hist):
    """Generate one random non-empty word from the bigram model.

    Args:
        ok_letters: String of letters the word may use.
        next_let_hist: Mapping from a character to its list of
            `(next_char, probability)` pairs, as returned by
            load_bigram_model(); '_' denotes a word boundary.

    Returns:
        A non-empty string built only from characters in `ok_letters`.
    """
    # Retry in a loop rather than recursing; the original retry call left
    # out `ok_letters`, which would have raised TypeError if it ever ran.
    while True:
        letters = ['_']
        while True:
            next_set = next_let_hist[letters[-1]]
            # The end-of-word marker is not allowed as the first pick, so a
            # word cannot end before it has at least one letter.
            ok = ok_letters if len(letters) == 1 else ok_letters + '_'
            next_set = filter_to_ok_letters(ok, next_set)
            letters.append(get_random_next_letter(letters[-1], next_set))
            if letters[-1] == '_':
                break
        # Drop the leading and trailing boundary markers.
        word = ''.join(letters[1:-1])
        if word:
            return word
if __name__ == '__main__':

    # With no argument, show the usage text from the module docstring.
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(0)

    # The model alphabet is lowercase a-z, so normalize the input to match.
    letters = sys.argv[1].lower()

    # The key letter must be able to appear in a generated word; otherwise
    # the rejection loop below could never finish normally.
    if not letters or not ('a' <= letters[0] <= 'z'):
        sys.exit('Error: the first (key) letter must be one of a-z.')

    key_letter = letters[0]
    next_let_hist = load_bigram_model(letters)

    # Print 500 words, re-sampling each one until it contains the key letter.
    for _ in range(500):
        word = ''
        while key_letter not in word:
            word = generate_random_word(letters, next_let_hist)
        print(word)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment