Created
March 26, 2022 19:24
-
-
Save noveoko/07acae21a5b436f33bdbee32cc240154 to your computer and use it in GitHub Desktop.
naive text generation using next character statistics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter, defaultdict | |
import re | |
import random | |
# Minimum word length setting (not referenced by any of the visible code).
MIN_WORD_LEN: int = 3

# Corpus file the character statistics are built from.
PATH: str = 'src/usa_last_names.txt'
# Alternative corpus:
# PATH = 'src/news_article.txt'
def word_size_frequency(words: list[str]):
    """Count how many words there are of each length.

    Returns a ``Counter`` mapping word length -> number of entries in
    *words* that have that length.
    """
    lengths = map(len, words)
    return Counter(lengths)
def list_of_ints_to_1_0(list_of_ints):
    """Normalise a list of counts into fractions that sum to 1.

    Args:
        list_of_ints: list of numbers (e.g. occurrence counts).

    Returns:
        A list in which each element is the corresponding input divided by
        the total of all inputs. An empty input gives an empty list.

    Raises:
        ZeroDivisionError: if the input is non-empty and sums to zero.
    """
    # Sum once up front rather than once per element (which was quadratic).
    total = sum(list_of_ints)
    return [value / total for value in list_of_ints]
def word_order_frequency(word):
    """Pair every character of *word* with its zero-based position.

    Returns a list of ``(position, character)`` tuples in word order.
    """
    return list(enumerate(word))
def document_to_words(file_path: str = 'src/usa_last_names.txt'):
    r"""Read a text file and return every word in it, lower-cased.

    A word is any match of the regex ``\w+``, so punctuation and whitespace
    act as separators (``O'Neil`` yields ``o`` and ``neil``).

    Args:
        file_path: path of the text file to read. It is opened with the
            platform's default text encoding.

    Returns:
        A flat list of lower-cased words in file order.

    Raises:
        OSError: if the file cannot be opened.
    """
    word_pattern = re.compile(r'\w+')
    words = []
    # The context manager guarantees the handle is closed, even when reading
    # fails part-way through.
    with open(file_path) as source:
        for line in source:
            words.extend(token.lower() for token in word_pattern.findall(line))
    return words
def calculate_probabilities(words: list[str]):
    """Build per-position character distributions keyed by first letter.

    For every distinct first character ``c`` found in *words* and every
    position ``i`` in ``range(length of the longest word)``, the result maps
    ``(c, i)`` to ``(symbols, probabilities)``: the characters observed at
    position ``i`` of the words starting with ``c`` and their relative
    frequencies (two parallel lists, in first-seen order).

    Position 0 is the first letter itself and is never recorded, so
    ``(c, 0)`` is always ``([], [])``. Positions that no word starting with
    ``c`` reaches are ``([], [])`` as well.

    Args:
        words: the corpus. Empty strings are ignored.

    Returns:
        ``{(first_char, position): (symbols, probabilities)}``; an empty dict
        when *words* contains no non-empty word.
    """
    # Empty strings have no first letter to key on; drop them up front.
    words = [word for word in words if word]
    if not words:
        return {}

    max_word_len = max(map(len, words))

    # first letter -> one bucket of observed characters per position.
    # Buckets are created once per distinct first letter (in first-seen
    # order), not once per word.
    character_space = {}
    for word in words:
        start = word[0]
        buckets = character_space.get(start)
        if buckets is None:
            buckets = character_space[start] = [[] for _ in range(max_word_len)]
        for index, character in enumerate(word[1:], start=1):
            buckets[index].append(character)

    final_probabilities = {}
    for start, buckets in character_space.items():
        for index, characters in enumerate(buckets):
            counts = Counter(characters)
            symbols = list(counts)
            probabilities = list_of_ints_to_1_0(list(counts.values()))
            final_probabilities[(start, index)] = (symbols, probabilities)
    return final_probabilities
# Character table used by the generators below; built once, when the module
# is loaded, from the corpus file named by PATH.
word_probabilities = calculate_probabilities(
    words=document_to_words(file_path=PATH),
)
def generate_word(start_char="A", word_size: int = 5, probabilities=None):
    """Generate one word by sampling a character for each position.

    The word starts with *start_char*; for each position ``1 ..
    word_size - 1`` a character is drawn from the distribution stored under
    ``(start_char, position)``. Positions with no entry, or with an entry
    that has no observed characters, are skipped silently, so the result may
    be shorter than *word_size*.

    Args:
        start_char: first character of the word. NOTE: the module-level
            table is built from lower-cased words, so the default ``"A"``
            matches nothing there and yields just ``"A"``.
        word_size: maximum length of the generated word.
        probabilities: optional table
            ``{(first_char, position): (symbols, weights)}``. Defaults to the
            module-level ``word_probabilities``.

    Returns:
        The generated word as a string.
    """
    table = word_probabilities if probabilities is None else probabilities
    chars = [f'{start_char}']
    for position in range(1, word_size):
        symbols, weights = table.get((start_char, position), ((), ()))
        if not symbols:
            # Unknown key or nothing observed at this position: skip it
            # explicitly instead of catching and printing an exception for
            # every miss.
            continue
        chars.append(random.choices(symbols, weights)[0])
    return ''.join(chars)
def generate_bulk_words(number_of_words: int = 100):
    """Generate a set of distinct random words.

    Each word gets a random target length between 5 and 9 and a first letter
    taken from a randomly chosen key of the module-level
    ``word_probabilities`` table, then is produced by ``generate_word``.

    NOTE: the loop only ends once *number_of_words* distinct words have been
    collected, so it does not terminate if the table cannot produce that many
    different words.

    Args:
        number_of_words: how many distinct words to collect.

    Returns:
        A set of exactly *number_of_words* generated words (an empty set
        when *number_of_words* is zero or negative).

    Raises:
        IndexError: if the table is empty and at least one word is requested.
    """
    words = set()
    if number_of_words <= 0:
        return words

    # The table does not change while generating, so snapshot the candidate
    # first letters once instead of rebuilding the key list for every word.
    # One entry per table key keeps the draw identical to choosing a key and
    # taking its first element.
    start_chars = [key[0] for key in word_probabilities]
    sizes = [5, 6, 7, 8, 9]

    while len(words) < number_of_words:
        size = random.choice(sizes)
        start = random.choice(start_chars)
        words.add(generate_word(start_char=start, word_size=size))
    return words
# Append generated words to the output file, one per line, in batches of 100
# until at least one million have been written. Progress is printed after
# every batch. Batches are de-duplicated individually, not against each other.
with open('src/one_million_words.txt', 'a') as f:
    words_written = 0
    while words_written < 1_000_000:
        new_words = generate_bulk_words(number_of_words=100)
        f.writelines(word + '\n' for word in new_words)
        words_written += len(new_words)
        print('Written:', words_written)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment