Skip to content

Instantly share code, notes, and snippets.

@noveoko
Created March 26, 2022 19:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save noveoko/07acae21a5b436f33bdbee32cc240154 to your computer and use it in GitHub Desktop.
Save noveoko/07acae21a5b436f33bdbee32cc240154 to your computer and use it in GitHub Desktop.
Naive text generation using per-position character statistics, conditioned on each word's first letter
from collections import Counter, defaultdict
import re
import random
# Minimum word length setting; not referenced anywhere in the visible code.
MIN_WORD_LEN = 3
# Corpus file that the module-level probability table is built from.
PATH = 'src/usa_last_names.txt'
# Alternative corpus, currently disabled.
#PATH = 'src/news_article.txt'
def word_size_frequency(words:list[str]):
    """Count how many words there are of each length.

    Args:
        words: The words to measure.

    Returns:
        A Counter mapping word length to the number of words of that length.
    """
    lengths = map(len, words)
    return Counter(lengths)
def list_of_ints_to_1_0(list_of_ints):
    """Scale a sequence of counts so that the results sum to 1.

    Args:
        list_of_ints: Counts to normalise. Any iterable is accepted; it is
            materialised once so one-shot iterators work too.

    Returns:
        A list with each count divided by the total of all counts, in the
        original order. An empty input yields an empty list.

    Raises:
        ZeroDivisionError: If the input is non-empty and its total is zero.
    """
    values = list(list_of_ints)
    # Compute the total once instead of once per element.
    total = sum(values)
    return [value / total for value in values]
def word_order_frequency(word):
    """Pair every character of ``word`` with its zero-based position.

    Args:
        word: The sequence to enumerate.

    Returns:
        A list of ``(index, character)`` tuples in order of appearance.
    """
    return list(enumerate(word))
def document_to_words(file_path:str='src/usa_last_names.txt') -> list[str]:
    """Read a text file and return every word in it, lower-cased.

    A word is any run of ``\\w`` characters, so punctuation and whitespace
    act as separators.

    Args:
        file_path: Path of the text file to read.

    Returns:
        All words in file order, converted to lower case.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    pattern = re.compile(r'\w+')
    words = []
    # The context manager closes the handle even if reading fails; the
    # encoding is left at the platform default, exactly as before.
    with open(file_path) as source:
        # Iterate the file lazily rather than loading every line up front.
        for line in source:
            words.extend(token.lower() for token in pattern.findall(line))
    return words
def calculate_probabilities(words:list[str]):
    """Build per-position character distributions grouped by first letter.

    For every distinct first character, and for every position from 0 up to
    the length of the longest word minus one, the characters seen at that
    position (in words starting with that first character) are counted and
    turned into relative frequencies.

    Args:
        words: The training words. Empty strings are ignored.

    Returns:
        A dict keyed by ``(first_char, position)``. Each value is a
        ``(symbols, probabilities)`` pair of parallel lists. Position 0 is
        always ``([], [])`` because only characters after the first one are
        recorded; positions that no word with that first letter reaches are
        ``([], [])`` as well. An input with no usable words yields ``{}``.
    """
    # Empty strings have no first character to group by.
    usable_words = [word for word in words if word]
    if not usable_words:
        return {}

    max_word_len = max(map(len, usable_words))

    # first character -> one Counter per position. Counting directly avoids
    # storing every character, and the buckets are created only once per
    # first character rather than once per word.
    position_counts = {}
    for word in usable_words:
        start = word[0]
        buckets = position_counts.get(start)
        if buckets is None:
            buckets = [Counter() for _ in range(max_word_len)]
            position_counts[start] = buckets
        for index, character in enumerate(word):
            if index > 0:
                buckets[index][character] += 1

    final_probabilities = {}
    for start, buckets in position_counts.items():
        for index, counts in enumerate(buckets):
            symbols = list(counts)
            probabilities = list_of_ints_to_1_0(list(counts.values()))
            final_probabilities[(start, index)] = (symbols, probabilities)
    return final_probabilities
# Build the (first letter, position) -> (symbols, probabilities) table once,
# at import time, from the corpus at PATH. generate_word and
# generate_bulk_words read it as a module-level global.
word_probabilities = calculate_probabilities(document_to_words(PATH))
def generate_word(start_char="A",word_size:int=5):
    """Generate a word by sampling one character per position.

    Each position after the first is drawn independently from the
    module-level ``word_probabilities`` entry for ``(start_char, position)``.

    Args:
        start_char: First character of the word; also selects which
            distributions are sampled.
        word_size: Maximum length of the generated word.

    Returns:
        A string that begins with ``start_char`` and has at most
        ``word_size`` characters. It is shorter when the table has no data
        for a position.

    NOTE(review): the table built in this file is keyed by lower-cased first
    letters, so the upper-case default "A" normally has no statistics and is
    returned unchanged. Confirm whether the default should be lower-case.
    """
    letters = [f'{start_char}']
    for position in range(1, word_size):
        symbols, weights = word_probabilities.get((start_char, position), ((), ()))
        if not symbols:
            # No statistics for this position. calculate_probabilities fills
            # positions contiguously, so later positions have none either;
            # stop here instead of raising and printing once per position.
            break
        letters.append(random.choices(symbols, weights)[0])
    return ''.join(letters)
def generate_bulk_words(number_of_words:int=100):
    """Generate a set of distinct random words.

    Each attempt picks a length from 5 to 9 and a first character taken from
    a random key of the module-level ``word_probabilities`` table, then
    calls ``generate_word``.

    Args:
        number_of_words: How many distinct words to collect.

    Returns:
        A set containing ``number_of_words`` distinct words (empty when
        ``number_of_words`` is zero or negative).

    Raises:
        IndexError: If ``word_probabilities`` is empty and at least one word
            is requested, because there is no key to choose from.

    NOTE(review): the loop only ends once enough *distinct* words exist; if
    the table cannot produce that many different words it will not
    terminate. Confirm the corpus is large enough for the requested count.
    """
    words = set()
    # Both candidate sequences are loop-invariant, so build them once rather
    # than on every attempt.
    sizes = (5, 6, 7, 8, 9)
    table_keys = list(word_probabilities)
    while len(words) < number_of_words:
        # Draw size before start to keep the random call order unchanged.
        size = random.choice(sizes)
        start = random.choice(table_keys)[0]
        words.add(generate_word(start_char=start, word_size=size))
    return words
# Append generated words, one per line, in batches of 100 until at least
# one million have been written; report the running total after each batch.
with open('src/one_million_words.txt', 'a') as f:
    words_written = 0
    while words_written < 1000000:
        new_words = generate_bulk_words(number_of_words=100)
        f.writelines(word + '\n' for word in new_words)
        words_written += len(new_words)
        print('Written:', words_written)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment