# Markov-chain generator for Ayeri
#
# NOTE: GitHub flagged this file as containing bidirectional Unicode text that
# may be interpreted or compiled differently than it appears. To review, open
# the file in an editor that reveals hidden Unicode characters.
#! /usr/bin/env python3
# -*- coding: utf-8 -*-

# PROGRAM SETTINGS: ============================================================
# Connection parameters for the MySQL dictionary database. The '...' values
# are placeholders and must be replaced before the script can connect.
HOST = '...'
USER = '...'
PASSWORD = '...'
DATABASE = 'dictdb'

# Query that selects the training corpus: entries that contain no spaces,
# digits or the listed punctuation, optionally ending in a single hyphen.
QUERY_STATEMENT = '''
SELECT `word`
FROM `dictdb`.`words`
WHERE `word` REGEXP '^[^ .,;:!?()1234567890-]+-?$'
'''
WORDFIELD = 'word'  # Same as `word` in SELECT clause above
# ==============================================================================
import Levenshtein | |
import markovify | |
import pymysql.cursors | |
import re | |
import argparse | |
import sys | |
# Connect to dictionary DB. If the connection cannot be established there is
# nothing to train on, so report the problem and stop instead of continuing
# with an undefined `connection`.
try:
    connection = pymysql.connect(
        host=HOST,
        user=USER,
        password=PASSWORD,
        db=DATABASE,
        cursorclass=pymysql.cursors.DictCursor,
        charset='utf8',
    )
except NameError as e:
    sys.exit('** ERROR: Variable {} is not set'.format(e))
except pymysql.err.OperationalError as e:
    sys.exit('** ERROR: {}'.format(e))

try:
    with connection.cursor() as cursor:
        # Do the DB query
        cursor.execute(QUERY_STATEMENT)
        # We want this as a list of strings rather than a list of dicts.
        # Regular pymysql.cursors.Cursor is not *as* useful since it will
        # return each line as a tuple. We'll have to extract the single
        # values either way.
        lines = [row[WORDFIELD] for row in cursor.fetchall()]
finally:
    # Always release the connection, even if the query failed.
    connection.close()

# Train on the lower-cased words, one character per token; a state length of 3
# seems to balance complexity and novelness best.
chain = markovify.Chain([list(word.lower()) for word in lines], state_size=3)
# Guess at number of syllables by counting vowels and diphthongs
# (WARNING: Ayeri specific)
def guess_syl_no(word):
    """Return an estimate of the number of syllables in *word*.

    Each syllable nucleus is counted once. A nucleus is either the diphthong
    ``au`` or a single vowel, optionally followed by ``y`` when that ``y`` is
    not itself followed by a vowel (so ``ay`` in ``naray`` is one nucleus,
    while ``aya`` contains two).

    :param word: the (lower-case) word to inspect.
    :returns: the number of nuclei found; 0 for an empty string.
    """
    V = '[aeiouāēīōū]'
    # `au` must be tried first: regex alternation takes the first branch that
    # matches, and the single-vowel branch would otherwise always consume the
    # `a` and leave the `u` to be counted as a second syllable.
    vowel = re.compile(r'(?:au|{}(?:y(?!{}))?)'.format(V, V))
    return len(vowel.findall(word))
# Function to draw a new word
discarded = 0


def _is_knocked_out(word):
    """Return True if the non-empty candidate *word* must be rejected.

    The checks compare the candidate against every entry of the module-level
    corpus ``lines``; this gets slow with very long word corpuses.
    """
    # If the word is too similar to an existing word. The original author
    # notes a sweet spot around 0.87; the cut-off used here is 0.88.
    if any(Levenshtein.jaro(word, known) > 0.88 for known in lines):
        return True

    # AYERI SPECIFIC: ----------------------------------------------------------
    # If the word is too long for a simple root
    if guess_syl_no(word) > 2:
        return True

    # A candidate ending in -an/-ān looks like a nominalization; one ending
    # in a hyphen looks like a verb stem. The two shapes exclude each other.
    nomz = re.search(r'^(.+?)(?:is)?[aā]n$', word)
    vb = re.search(r'^(.+?)(?:is)?a?-$', word)

    if nomz:
        # Escape the stem: it is generated text, and the corpus query does not
        # exclude every regex metacharacter (e.g. `*`, `+`, `[`).
        stem = re.escape(nomz.group(1))
        # A would-be nominalization of an existing corresponding verb
        verb_form = re.compile(r'{}(?:is)?a?-'.format(stem))
        # A nominalization with the same stem that already exists
        noun_form = re.compile(r'{}(?:is)?[aā]n'.format(stem))
        if any(verb_form.match(known) or noun_form.match(known)
               for known in lines):
            return True

    if vb:
        # There is already a nominalization of the generated verb
        noun_form = re.compile(
            r'{}(?:is)?[aā]n'.format(re.escape(vb.group(1))))
        if any(noun_form.match(known) for known in lines):
            return True
    # --------------------------------------------------------------------------

    return False


def draw_word():
    """Generate words from ``chain`` until one survives all knock-out checks.

    Empty results are silently retried. Every other rejected candidate
    increments the module-level counter ``discarded``. Retrying is done in a
    loop rather than by recursion so that a long run of rejected candidates
    cannot exhaust the interpreter's recursion limit.

    :returns: a newly generated word as a string.
    """
    global discarded
    while True:
        # Generate a new string
        random_word = ''.join(chain.gen())
        # If nothing is generated (join never yields None, only '')
        if not random_word:
            continue
        if _is_knocked_out(random_word):
            discarded += 1
            continue
        # If finally successful:
        return random_word
def main(argv=None):
    """Parse command line options and return the generated word list as text.

    :param argv: list of command line arguments without the program name;
        defaults to ``sys.argv[1:]`` when None.
    :returns: the generated words, sorted and separated by newlines. With
        ``-d``/``--show-discarded`` the number of discarded candidates and
        the corpus size are appended.
    """
    if argv is None:
        argv = sys.argv[1:]

    # Parser for command line options
    parser = argparse.ArgumentParser(
        description='Generates new Ayeri words from database input by way '
                    'of a Markov chain')
    parser.add_argument(
        '-n', type=int, default=10,
        help='Number of words to generate (default: 10)')
    parser.add_argument(
        '-d', '--show-discarded', action='store_true',
        help='Show the number of generated words discarded, e.g. duplicates '
             'of existing words (default: False)')
    args = parser.parse_args(argv)

    # Generate n unique words; a set drops repeated draws automatically.
    # For n <= 0 no word is drawn at all.
    wordlist = set()
    while len(wordlist) < args.n:
        wordlist.add(draw_word())

    listing = '\n'.join(sorted(wordlist))
    if not args.show_discarded:
        return listing
    return '{}\n\n** Discarded: {}\n** Sample size: {}\n'.format(
        listing,
        discarded,
        len(lines)
    )
# Script entry point: write the generated word list to standard output, then
# exit with the default (success) status.
if __name__ == '__main__':
    sys.stdout.write(main())
    sys.exit()
# End of script. The text below is GitHub page footer captured with the gist:
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment