Skip to content

Instantly share code, notes, and snippets.

@1ort
Created April 16, 2024 08:44
Show Gist options
  • Save 1ort/28459e01944fd43df39241be5f476a63 to your computer and use it in GitHub Desktop.
Save 1ort/28459e01944fd43df39241be5f476a63 to your computer and use it in GitHub Desktop.
Simple name generation via Markov chains
import random
from collections import defaultdict
from itertools import islice
def batched(iterable, n):
    """Yield successive tuples of up to ``n`` items taken from ``iterable``.

    Every tuple holds exactly ``n`` items except possibly the last one,
    which holds whatever is left over.

    Example: batched('ABCDEFG', 3) --> ABC DEF G

    Raises:
        ValueError: if ``n`` is smaller than one. Because this is a
            generator, the error surfaces on the first iteration, not at
            call time.
    """
    if n < 1:
        raise ValueError('n must be at least one')
    source = iter(iterable)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            # An empty slice means the underlying iterator is exhausted.
            return
        yield chunk
def batched_str(s, n):
    """Yield consecutive pieces of ``s``, each at most ``n`` characters long.

    The pieces come from ``batched``, so only the final piece can be
    shorter than ``n``; each batch is joined back into a single string.
    """
    yield from map(''.join, batched(s, n))
class MarkovChain:
    """First-order Markov chain over fixed-size chunks of characters.

    Training words are lower-cased, stripped and cut into consecutive
    tokens of at most ``token_max_length`` characters.

    ``transition_matrix`` maps a token to a counter of the tokens observed
    directly after it. ``None`` is used as a marker on both sides: as an
    outer key it means "start of a word", as an inner key it means "end of
    a word".
    """

    def __init__(self, token_max_length: int):
        """Create an empty chain.

        Args:
            token_max_length: Maximum number of characters per token.

        Raises:
            ValueError: if ``token_max_length`` is smaller than one.
        """
        # Fail fast: otherwise the bad value only surfaces later, while
        # tokenizing the first word inside train().
        if token_max_length < 1:
            raise ValueError('token_max_length must be at least one')
        self.transition_matrix = defaultdict(lambda: defaultdict(int))
        self.token_max_length = token_max_length

    def train(self, words) -> None:
        """Add the token transitions found in ``words`` to the chain.

        Counts accumulate, so the method may be called several times.

        Args:
            words: Iterable of strings. Each one is lower-cased and
                stripped of surrounding whitespace before being tokenized.
                Entries that are empty after stripping are ignored.
        """
        for word in words:
            word = word.lower().strip()
            if not word:
                # A blank entry would only record a start -> end transition
                # and make the chain produce empty names.
                continue
            last_token = None  # None marks the start of a word.
            for token in batched_str(word, self.token_max_length):
                self.transition_matrix[last_token][token] += 1
                last_token = token
            # None as the successor marks the end of the word.
            self.transition_matrix[last_token][None] += 1

    def next_state(self, current_state):
        """Randomly pick the token that follows ``current_state``.

        Args:
            current_state: The last emitted token. ``None`` or an empty
                string selects the start-of-word state.

        Returns:
            A token drawn with probability proportional to its observed
            count, or ``None`` when the word should end: either the end
            marker was drawn or no transition is known for the state
            (for example on an untrained chain).
        """
        current_state = current_state or None
        # Use .get() rather than indexing: indexing the defaultdict would
        # insert an empty row for every unknown state that is probed.
        transitions = self.transition_matrix.get(current_state)
        if not transitions:
            return None
        return random.choices(
            list(transitions), weights=list(transitions.values())
        )[0]

    def generate_name(self, max_length: int = 10) -> str:
        """Generate a random name by walking the chain from the start state.

        Tokens are appended until the end marker is drawn or the name has
        reached ``max_length`` characters.

        Args:
            max_length: Upper bound on the length of the returned string.

        Returns:
            The generated name in lower case, never longer than
            ``max_length``. Empty when the chain has not been trained.
        """
        name_parts = []
        length = 0  # Running total, avoids re-summing all parts each step.
        last_token = None
        while length < max_length:
            next_token = self.next_state(last_token)
            if next_token is None:
                break
            name_parts.append(next_token)
            length += len(next_token)
            last_token = next_token
        # The last token may push the name past the limit when tokens are
        # longer than one character, so cut the overshoot off.
        return "".join(name_parts)[:max_length]
# Example usage: train a chain on the sample names below and print ten
# generated names for every token length in the range.
# A few names appear twice in the list; they are kept because every
# occurrence is counted again during training.
names = [
    "Lagdush", "Groduf", "Buga", "Uglush", "Lurtzog",
    "Lugduf", "Grat", "Gorkil", "Bolga", "Snakhak",
    "Mega", "Luga", "Gorkil", "Balagd", "Grat",
    "Rat", "Agdur", "Balurtz", "Lagduf", "Lurtzog",
    "Feanore", "Fingormin", "Ithil", "Celenwe", "Makili",
    "Pengoli", "Throdore", "Maedhrondir", "Gilgali", "Ebrin",
    "Eneleg", "Galadel", "Alamras", "Irdahil", "Enlor",
    "Elelung", "Elron", "Araliod", "Finore", "Gelmire",
]

for token_length in range(1, 2):  # range(1, 2) covers token length 1 only
    print("token length: ", token_length)
    markov_chain = MarkovChain(token_length)
    markov_chain.train(names)
    for i in range(10):
        generated = markov_chain.generate_name()
        # Names are generated in lower case; capitalize for display.
        print(generated.capitalize())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment