Skip to content

Instantly share code, notes, and snippets.

@carbeck
Created April 26, 2017 19:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save carbeck/c618d00819bae1d4c31937510fcd35d9 to your computer and use it in GitHub Desktop.
Save carbeck/c618d00819bae1d4c31937510fcd35d9 to your computer and use it in GitHub Desktop.
Markov-chain generator for Ayeri
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# PROGRAM SETTINGS: ============================================================
# Connection parameters handed to pymysql.connect() below. The '...' values
# look like placeholders -- presumably to be replaced with real credentials
# before running.
HOST = '...'
USER = '...'
PASSWORD = '...'
DATABASE = 'dictdb'
# Query that supplies the training corpus, one word per row. The REGEXP keeps
# only entries made of one or more characters that are none of: space,
# . , ; : ! ? ( ), a digit or a hyphen -- optionally followed by a single
# trailing hyphen.
# NOTE(review): the schema name `dictdb` is hard-coded in the query and is not
# derived from DATABASE; keep the two in sync when changing either.
QUERY_STATEMENT = '''
SELECT `word`
FROM `dictdb`.`words`
WHERE `word` REGEXP '^[^ .,;:!?()1234567890-]+-?$'
'''
# Column name used to pull the word out of each result row (rows are dicts).
WORDFIELD = 'word' # Same as `word` in SELECT clause above
# ==============================================================================
import Levenshtein
import markovify
import pymysql.cursors
import re
import argparse
import sys
# Connect to dictionary DB
# A failed connection is fatal: without it there is no corpus to train on, so
# report the problem and stop instead of falling through to code that would
# only fail with an unrelated NameError on `connection`.
try:
    connection = pymysql.connect(
        host=HOST,
        user=USER,
        password=PASSWORD,
        db=DATABASE,
        cursorclass=pymysql.cursors.DictCursor,
        charset='utf8',
    )
except NameError as e:
    # One of the settings constants above is missing
    sys.exit('** ERROR: Variable {} is not set'.format(e))
except pymysql.err.OperationalError as e:
    sys.exit('** ERROR: {}'.format(e))

try:
    with connection.cursor() as cursor:
        # Do the DB query
        cursor.execute(QUERY_STATEMENT)
        # We want this as a list of strings rather than a list of dicts.
        # Regular pymysql.cursors.Cursor is not *as* useful since it will
        # return each line as a tuple. We'll have to extract the single
        # values either way.
        lines = [row[WORDFIELD] for row in cursor.fetchall()]
finally:
    # Always release the connection, even if the query failed
    connection.close()

# Nothing can be generated from an empty corpus, so stop with a clear message
if not lines:
    sys.exit('** ERROR: The query returned no words to train on')

# Train on the lower-cased words, split into single characters; a length of 3
# seems to balance complexity and novelness best
chain = markovify.Chain([list(word.lower()) for word in lines], state_size=3)
# Guess at number of syllables by counting vowels and diphthongs
# (WARNING: Ayeri specific)
_VOWEL = '[aeiouāēīōū]'
# A syllable nucleus is either the diphthong "au", or a vowel optionally
# followed by "y" when that "y" does not start the next syllable (i.e. is not
# itself followed by a vowel). "au" has to be tried FIRST: regex alternation
# is ordered, and the vowel class would otherwise always consume the "a" and
# leave the "u" to be counted as a second nucleus.
_SYLLABLE_NUCLEUS = re.compile(r'(?:au|{0}(?:y(?!{0}))?)'.format(_VOWEL))


def guess_syl_no(word):
    """Return the estimated number of syllables in *word*.

    The estimate is the number of non-overlapping syllable nuclei (vowels
    and diphthongs) found in the string; a string without vowels yields 0.
    Only lower-case vowels are recognised.
    """
    return len(_SYLLABLE_NUCLEUS.findall(word))
# Function to draw a new word
discarded = 0

# Generated words at least this similar (Jaro) to an existing word are
# dropped. There seems to be a sweet spot around 0.87.
_MAX_SIMILARITY = 0.88
# AYERI SPECIFIC: a simple root should not have more syllables than this
_MAX_SYLLABLES = 2
# AYERI SPECIFIC: shapes of a nominalization ("...(is)an") and of a verb
# ("...(is)(a)-"); group 1 is the stem in both cases
_NOMINALIZATION = re.compile(r'^(.+?)(?:is)?[aā]n$')
_VERB = re.compile(r'^(.+?)(?:is)?a?-$')


def _derivation_exists(word):
    """Tell whether *word* collides with a derivation already in the corpus.

    AYERI SPECIFIC. A would-be nominalization is rejected when an existing
    entry starts like the corresponding verb or like a nominalization of
    the same stem; a would-be verb is rejected when an existing entry
    starts like its nominalization. The stem is escaped so that any regex
    metacharacter in it is matched literally.
    """
    nomz = _NOMINALIZATION.search(word)
    if nomz is not None:
        stem = re.escape(nomz.group(1))
        taken = re.compile(r'{}(?:is)?(?:a?-|[aā]n)'.format(stem))
        return any(taken.match(known) for known in lines)
    vb = _VERB.search(word)
    if vb is not None:
        stem = re.escape(vb.group(1))
        taken = re.compile(r'{}(?:is)?[aā]n'.format(stem))
        return any(taken.match(known) for known in lines)
    return False


def _is_rejected(word):
    """Apply the knock-out criteria to *word*; True means discard it."""
    # Too similar to an existing word. any() stops at the first hit instead
    # of scoring the whole corpus every time.
    if any(Levenshtein.jaro(word, known) > _MAX_SIMILARITY
           for known in lines):
        return True
    # Too long for a simple root
    if guess_syl_no(word) > _MAX_SYLLABLES:
        return True
    return _derivation_exists(word)


def draw_word():
    """Generate one new word that passes all knock-out criteria.

    Strings are drawn from the Markov chain until one is accepted. Every
    rejected candidate increments the module-level ``discarded`` counter;
    empty draws are retried without being counted. Retrying happens in a
    loop rather than by recursion so that a long run of rejections cannot
    exhaust the interpreter's recursion limit.

    Returns:
        The accepted word as a string.
    """
    global discarded
    while True:
        # Generate a new string
        random_word = ''.join(chain.gen())
        # If nothing is generated
        if not random_word:
            continue
        if _is_rejected(random_word):
            discarded += 1
            continue
        # If finally successful:
        return random_word
def main(argv=None):
    """Parse the command line and return the generated word list as text.

    Args:
        argv: List of command line arguments; defaults to ``sys.argv[1:]``.

    Returns:
        The generated words, sorted and separated by newlines. With
        ``-d``/``--show-discarded`` a summary with the number of discarded
        candidates and the corpus size is appended.

    Words are drawn until ``-n`` distinct ones have been collected, so a
    value of 0 (or less) yields an empty list without drawing anything.
    """
    if argv is None:
        argv = sys.argv[1:]

    # Parser for command line options
    parser = argparse.ArgumentParser(
        description='Generates new Ayeri words\n'
                    'from database input by way of a Markov chain')
    parser.add_argument(
        '-n', type=int, default=10,
        help='Number of words to generate (default: 10)')
    parser.add_argument(
        '-d', '--show-discarded', action='store_true',
        help='Show the number of generated words discarded, e.g. duplicates\n'
             'of existing words (default: False)')
    args = parser.parse_args(argv)

    # Generate n unique words; a set takes care of duplicates
    words = set()
    while len(words) < args.n:
        words.add(draw_word())
    listing = '\n'.join(sorted(words))

    if not args.show_discarded:
        return listing
    return '{}\n\n** Discarded: {}\n** Sample size: {}\n'.format(
        listing,
        discarded,
        len(lines)
    )
# Script entry point: write the generated list to stdout, then exit
if __name__ == '__main__':
    report = main()
    sys.stdout.write(report)
    sys.exit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment