# noveoko/generate_words.py

## generate_words.py
from collections import Counter, defaultdict
import re
import random


# Minimum word length; not referenced by any code visible in this file.
MIN_WORD_LEN = 3
# Corpus file passed to document_to_words() when the lookup table is built.
PATH = 'src/usa_last_names.txt'
# Alternative corpus, currently disabled:
#PATH = 'src/news_article.txt'

def word_size_frequency(words:list[str]):
    """Tally how many words there are of each length.

    Returns a ``Counter`` keyed by word length; each value is the number of
    entries in *words* that have that length.
    """
    return Counter(map(len, words))

def list_of_ints_to_1_0(list_of_ints):
    """Scale a list of counts so that the values sum to 1.

    Args:
        list_of_ints: Numeric counts, as a list or other re-iterable sequence.

    Returns:
        A list holding each count divided by the total of all counts, in the
        same order as the input. An empty input gives an empty list.

    Raises:
        ZeroDivisionError: If the input is non-empty and its total is zero.
    """
    # Sum once, outside the comprehension, so the whole pass stays linear.
    total = sum(list_of_ints)
    return [count / total for count in list_of_ints]

def word_order_frequency(word):
    """Pair every character of *word* with its zero-based position.

    Returns a list of ``(index, character)`` tuples in positional order.
    """
    return list(enumerate(word))

def document_to_words(file_path:str='src/usa_last_names.txt'):
    """Read a text file and return all of its words, lower-cased.

    A word is a maximal run of regex word characters (letters, digits and
    underscore); everything else acts as a separator.

    Args:
        file_path: Path of the text file to read. It is opened in text mode
            with the platform's default encoding.

    Returns:
        A list of lower-cased words in the order they appear in the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    pattern = re.compile(r'\w+')
    words = []
    # The context manager closes the handle even if reading fails part-way.
    with open(file_path) as handle:
        for line in handle:
            words.extend(match.lower() for match in pattern.findall(line))
    return words

def calculate_probabilities(words:list[str]):
    """Build per-position character distributions grouped by first letter.

    For every distinct first character ``c`` and every position ``i`` from 0
    up to (but excluding) the length of the longest word, the result maps
    ``(c, i)`` to a ``(symbols, probabilities)`` pair: the characters seen at
    position ``i`` in words starting with ``c`` (in first-seen order) and
    their relative frequencies in matching order. Position 0, and any
    position with no observations, maps to ``([], [])``.

    Args:
        words: Non-empty strings to analyse.

    Returns:
        A dict keyed by ``(first_character, position)``. An empty *words*
        list yields an empty dict.

    Raises:
        IndexError: If any entry of *words* is an empty string.
    """
    # max() of an empty sequence raises, so handle "no words" explicitly.
    if not words:
        return {}

    max_word_len = max(len(word) for word in words)

    # One Counter per position for each first character. Slots are created
    # the first time a first character is seen, so dict order follows the
    # first occurrence of each first character.
    position_counts = {}
    for word in words:
        start = word[0]
        slots = position_counts.get(start)
        if slots is None:
            slots = [Counter() for _ in range(max_word_len)]
            position_counts[start] = slots
        # Position 0 is the first character itself and is never counted.
        for index, character in enumerate(word[1:], start=1):
            slots[index][character] += 1

    final_probabilities = {}
    for start, slots in position_counts.items():
        for index, counts in enumerate(slots):
            total = sum(counts.values())
            final_probabilities[(start, index)] = (
                list(counts),
                [count / total for count in counts.values()],
            )
    return final_probabilities

# Built once at import time from the corpus at PATH; generate_word() and
# generate_bulk_words() read this table as a module-level global.
word_probabilities = calculate_probabilities(document_to_words(PATH))

def generate_word(start_char="A",word_size:int=5):
    """Generate a random word that begins with *start_char*.

    Characters for positions ``1 .. word_size - 1`` are drawn from the
    module-level ``word_probabilities`` table, looked up by
    ``(start_char, position)``. Positions with no table entry, or with an
    entry that has no symbols, are skipped, so the result can be shorter
    than *word_size*.

    Args:
        start_char: First character of the word; also selects which
            distributions are used.
        word_size: Maximum length of the generated word.

    Returns:
        The generated word; always starts with ``start_char``.
    """
    # NOTE(review): document_to_words() lower-cases its output, so the
    # default "A" will not match a table built from it -- confirm intent.
    characters = [f'{start_char}']
    for position in range(1, word_size):
        # A missing or empty entry would make random.choices() raise. Skip it
        # explicitly so that unrelated errors are no longer swallowed.
        symbols, weights = word_probabilities.get((start_char, position), ((), ()))
        if not symbols:
            continue
        characters.append(random.choices(symbols, weights)[0])
    return ''.join(characters)


def generate_bulk_words(number_of_words:int=100):
    """Generate a set of distinct random words.

    Each word gets a target size chosen at random from 5 to 9 and a starting
    character taken from a randomly chosen key of the module-level
    ``word_probabilities`` table, and is then built by ``generate_word``.

    Args:
        number_of_words: Number of distinct words to collect.

    Returns:
        A set containing at least ``number_of_words`` distinct words (an
        empty set when ``number_of_words`` is zero or negative).

    Raises:
        IndexError: If ``word_probabilities`` is empty and at least one word
            is requested.
    """
    # NOTE: the loop runs until enough *distinct* words exist, so it does not
    # terminate if the table cannot produce that many different words.
    # Snapshot the keys once instead of rebuilding the list on every pass.
    table_keys = list(word_probabilities)
    words = set()
    while len(words) < number_of_words:
        size = random.choice([5,6,7,8,9])
        start = random.choice(table_keys)[0]
        words.add(generate_word(start_char=start, word_size=size))
    return words

# Append generated words to the output file, one per line, in batches of 100
# until the running tally reaches one million. The tally adds up batch
# sizes; words repeated across different batches are not filtered out.
with open('src/one_million_words.txt', 'a') as f:
    words_written = 0
    while words_written < 1_000_000:
        new_words = generate_bulk_words(number_of_words=100)
        for word in new_words:
            print(word, end='\n', file=f)
        words_written = words_written + len(new_words)
        # Progress report after every batch.
        print('Written:', words_written)
	from collections import Counter, defaultdict
	import re
	import random


	# Minimum word length; not referenced by any code visible in this file.
	MIN_WORD_LEN = 3
	# Corpus file passed to document_to_words() when the lookup table is built.
	PATH = 'src/usa_last_names.txt'
	# Alternative corpus, currently disabled:
	#PATH = 'src/news_article.txt'

	def word_size_frequency(words:list[str]):
		"""Count how many words there are of each length.

		Args:
			words: Words to measure.

		Returns:
			A ``Counter`` mapping word length to the number of entries in
			*words* with that length.
		"""
		return Counter(len(word) for word in words)

	def list_of_ints_to_1_0(list_of_ints):
		"""Scale a list of counts so that the values sum to 1.

		Args:
			list_of_ints: Numeric counts, as a list or other re-iterable
				sequence.

		Returns:
			A list holding each count divided by the total of all counts, in
			input order. An empty input gives an empty list.

		Raises:
			ZeroDivisionError: If the input is non-empty and its total is
				zero.
		"""
		# Sum once, outside the comprehension, so the pass stays linear.
		total = sum(list_of_ints)
		return [count / total for count in list_of_ints]

	def word_order_frequency(word):
		"""Pair every character of *word* with its zero-based position.

		Args:
			word: The string (or other iterable) to index.

		Returns:
			A list of ``(index, character)`` tuples in positional order.
		"""
		return list(enumerate(word))

	def document_to_words(file_path:str='src/usa_last_names.txt'):
		"""Read a text file and return all of its words, lower-cased.

		A word is a maximal run of regex word characters (letters, digits
		and underscore); everything else acts as a separator.

		Args:
			file_path: Path of the text file to read. It is opened in text
				mode with the platform's default encoding.

		Returns:
			A list of lower-cased words in the order they appear in the file.

		Raises:
			OSError: If the file cannot be opened or read.
		"""
		pattern = re.compile(r'\w+')
		words = []
		# The context manager closes the handle even if reading fails.
		with open(file_path) as handle:
			for line in handle:
				words.extend(match.lower() for match in pattern.findall(line))
		return words

	def calculate_probabilities(words:list[str]):
		"""Build per-position character distributions grouped by first letter.

		For every distinct first character ``c`` and every position ``i``
		from 0 up to (but excluding) the length of the longest word, the
		result maps ``(c, i)`` to a ``(symbols, probabilities)`` pair: the
		characters seen at position ``i`` in words starting with ``c`` (in
		first-seen order) and their relative frequencies in matching order.
		Position 0, and any position with no observations, maps to
		``([], [])``.

		Args:
			words: Non-empty strings to analyse.

		Returns:
			A dict keyed by ``(first_character, position)``. An empty
			*words* list yields an empty dict.

		Raises:
			IndexError: If any entry of *words* is an empty string.
		"""
		# max() of an empty sequence raises, so handle "no words" explicitly.
		if not words:
			return {}

		max_word_len = max(len(word) for word in words)

		# One Counter per position for each first character; slots are
		# created the first time a first character is seen.
		position_counts = {}
		for word in words:
			start = word[0]
			slots = position_counts.get(start)
			if slots is None:
				slots = [Counter() for _ in range(max_word_len)]
				position_counts[start] = slots
			# Position 0 is the first character itself and is never counted.
			for index, character in enumerate(word[1:], start=1):
				slots[index][character] += 1

		final_probabilities = {}
		for start, slots in position_counts.items():
			for index, counts in enumerate(slots):
				total = sum(counts.values())
				final_probabilities[(start, index)] = (
					list(counts),
					[count / total for count in counts.values()],
				)
		return final_probabilities

	# Built once at import time from the corpus at PATH; generate_word() and
	# generate_bulk_words() read this table as a module-level global.
	word_probabilities = calculate_probabilities(document_to_words(PATH))

	def generate_word(start_char="A",word_size:int=5):
		"""Generate a random word that begins with *start_char*.

		Characters for positions ``1 .. word_size - 1`` are drawn from the
		module-level ``word_probabilities`` table, looked up by
		``(start_char, position)``. Positions with no table entry, or with
		an entry that has no symbols, are skipped, so the result can be
		shorter than *word_size*.

		Args:
			start_char: First character of the word; also selects which
				distributions are used.
			word_size: Maximum length of the generated word.

		Returns:
			The generated word; always starts with ``start_char``.
		"""
		characters = [f'{start_char}']
		for position in range(1, word_size):
			# A missing or empty entry would make random.choices() raise.
			# Skip it explicitly so unrelated errors are not swallowed.
			symbols, weights = word_probabilities.get((start_char, position), ((), ()))
			if not symbols:
				continue
			characters.append(random.choices(symbols, weights)[0])
		return ''.join(characters)


	def generate_bulk_words(number_of_words:int=100):
		"""Generate a set of distinct random words.

		Each word gets a target size chosen at random from 5 to 9 and a
		starting character taken from a randomly chosen key of the
		module-level ``word_probabilities`` table, and is then built by
		``generate_word``.

		Args:
			number_of_words: Number of distinct words to collect.

		Returns:
			A set containing at least ``number_of_words`` distinct words (an
			empty set when ``number_of_words`` is zero or negative).

		Raises:
			IndexError: If ``word_probabilities`` is empty and at least one
				word is requested.
		"""
		# NOTE: the loop runs until enough *distinct* words exist, so it does
		# not terminate if the table cannot produce that many.
		# Snapshot the keys once instead of rebuilding the list every pass.
		table_keys = list(word_probabilities)
		words = set()
		while len(words) < number_of_words:
			size = random.choice([5,6,7,8,9])
			start = random.choice(table_keys)[0]
			words.add(generate_word(start_char=start, word_size=size))
		return words

	# Append generated words to the output file, one per line, in batches of
	# 100 until the running tally reaches one million. The tally adds up
	# batch sizes; words repeated across batches are not filtered out.
	with open('src/one_million_words.txt', 'a') as f:
		words_written = 0
		while words_written < 1000000:
			new_words = generate_bulk_words(number_of_words=100)
			words_written += len(new_words)
			for word in new_words:
				f.write(word + '\n')
			# Progress report after every batch.
			print('Written:', words_written)