carbeck/ayerimarkov.py Secret

## ayerimarkov.py
#! /usr/bin/env python3
# -*- coding: utf-8 -*-

# PROGRAM SETTINGS: ============================================================

# Connection parameters handed to pymysql.connect() further down.
# NOTE(review): credentials live in plain text in this file -- keep real
# values out of version control.
HOST = '...'
USER = '...'
PASSWORD = '...'
DATABASE = 'dictdb'
# Query that fetches the training corpus. The REGEXP keeps only entries made
# of one or more characters other than space, the listed punctuation, digits
# or hyphen, optionally followed by a single trailing hyphen.
QUERY_STATEMENT = '''
                    SELECT `word`
                    FROM `dictdb`.`words`
                    WHERE `word` REGEXP '^[^ .,;:!?()1234567890-]+-?$'
                '''
# Key used to pull the word out of each row dict returned by the cursor.
WORDFIELD = 'word' # Same as `word` in SELECT clause above

# ==============================================================================

import Levenshtein
import markovify
import pymysql.cursors
import re
import argparse
import sys

# Connect to dictionary DB
try:
    connection = pymysql.connect(
        host=HOST,
        user=USER,
        password=PASSWORD,
        db=DATABASE,
        cursorclass=pymysql.cursors.DictCursor,
        charset='utf8',
    )

# Without a connection nothing below can work, so stop here with a readable
# message instead of falling through and crashing on the undefined
# `connection` name.
except NameError as e:
    sys.exit('** ERROR: Variable {} is not set'.format(e))

except pymysql.err.OperationalError as e:
    sys.exit('** ERROR: {}'.format(e))

try:
    with connection.cursor() as cursor:

        # Do the DB query
        cursor.execute(QUERY_STATEMENT)

        # DictCursor returns one dict per row; reduce that to a plain list
        # of strings by picking the single selected column.
        lines = [row[WORDFIELD] for row in cursor.fetchall()]

finally:
    connection.close()

# A chain cannot be trained on an empty corpus
if not lines:
    sys.exit('** ERROR: Query returned no words to train on')

# Train on character sequences; a state size of 3 seems to balance
# complexity and novelty best
chain = markovify.Chain([list(word.lower()) for word in lines], state_size=3)

# Guess at number of syllables by counting vowels and diphthongs
# (WARNING: Ayeri specific)
_VOWEL = '[aeiouāēīōū]'
# 'au' has to be tried before the single-vowel alternative: regex alternation
# is ordered, so with the vowel first the 'a' always wins and 'au' would be
# counted as two syllables.
_SYLLABLE_NUCLEUS = re.compile(r'(?:au|{0}(?:y(?!{0}))?)'.format(_VOWEL))


def guess_syl_no(word):
    """Estimate the number of syllables in *word*.

    One syllable is counted for each nucleus found: the diphthong 'au', or
    a single vowel optionally followed by a 'y' that is not itself followed
    by a vowel (so 'ay' is one nucleus, but the 'y' in 'aya' is not
    absorbed).

    Args:
        word: The (lower-case) word to inspect.

    Returns:
        The number of nuclei found; 0 for a string without vowels.
    """
    return len(_SYLLABLE_NUCLEUS.findall(word))

# Function to draw a new word
discarded = 0

# Shapes of the two Ayeri-specific word classes checked below; group 1 is
# the bare stem.
_NOMINALIZATION = re.compile(r'^(.+?)(?:is)?[aā]n$')
_VERB = re.compile(r'^(.+?)(?:is)?a?-$')

# Candidates whose Jaro similarity to any existing word exceeds this value
# are thrown away. Found by trial and error (sweet spot around 0.87-0.88).
_MAX_SIMILARITY = 0.88


def _should_discard(word):
    """Return True if *word* fails any of the knock-out criteria."""
    # Too similar to an existing word. any() stops at the first hit instead
    # of building a full list over the corpus.
    if any(Levenshtein.jaro(word, known) > _MAX_SIMILARITY for known in lines):
        return True

    # AYERI SPECIFIC: ----------------------------------------------------------

    # Too long for a simple root
    if guess_syl_no(word) > 2:
        return True

    # The word looks like a nominalization: reject it if the corresponding
    # verb, or a nominalization with the same stem, already exists.
    nomz = _NOMINALIZATION.search(word)
    if nomz:
        # Escape the stem so generated characters are never read as regex
        # syntax.
        stem = re.escape(nomz.group(1))
        verb_of_stem = re.compile(r'{}(?:is)?a?-'.format(stem))
        noun_of_stem = re.compile(r'{}(is)?[aā]n'.format(stem))
        if any(verb_of_stem.match(known) or noun_of_stem.match(known)
               for known in lines):
            return True

    # The word looks like a verb: reject it if a nominalization of it
    # already exists. Testing the match object (rather than the last
    # character) guarantees group(1) is available.
    vb = _VERB.search(word)
    if vb:
        noun_of_verb = re.compile(
            r'{}(?:is)?[aā]n'.format(re.escape(vb.group(1))))
        if any(noun_of_verb.match(known) for known in lines):
            return True

    # --------------------------------------------------------------------------

    return False


def draw_word():
    """Generate one new word that passes all knock-out criteria.

    Candidates are drawn from the Markov chain until one is accepted. Every
    rejected candidate increments the module-level ``discarded`` counter;
    empty candidates are skipped without being counted.

    Returns:
        The accepted word as a string.
    """
    global discarded

    # A loop rather than recursion, so that a long run of rejections cannot
    # exhaust the recursion limit.
    while True:
        random_word = ''.join(chain.gen())

        # If nothing is generated
        if not random_word:
            continue

        if _should_discard(random_word):
            discarded += 1
            continue

        return random_word

def main(argv=None):
    """Parse command line options and build the output text.

    Args:
        argv: List of command line arguments; defaults to ``sys.argv[1:]``.

    Returns:
        The generated words, sorted and joined by newlines. With
        ``-d``/``--show-discarded`` a summary of the number of discarded
        candidates and the corpus size is appended.
    """
    if argv is None:
        argv = sys.argv[1:]

    # Parser for command line options
    parser = argparse.ArgumentParser(description='''Generates new Ayeri words
        from database input by way of a Markov chain''')
    parser.add_argument('-n', type=int, default=10,
        help='''Number of words to generate (default: 10)''')
    parser.add_argument('-d', '--show-discarded', action='store_true',
        help='''Show the number of generated words discarded, e.g. duplicates
        of existing words (default: False)''')
    args = parser.parse_args(argv)

    # Generate n unique words; a set drops repeated draws by itself
    words = set()
    while len(words) < args.n:
        words.add(draw_word())

    listing = '\n'.join(sorted(words))
    if not args.show_discarded:
        return listing

    return '{}\n\n** Discarded: {}\n** Sample size: {}\n'.format(
        listing,
        discarded,
        len(lines)
    )

# Script entry point: emit the generated text, then exit normally
if __name__ == '__main__':
    output = main()
    sys.stdout.write(output)
    sys.exit()
	#! /usr/bin/env python3
	# -- coding: utf-8 --

	# PROGRAM SETTINGS: ============================================================

	# Connection parameters handed to pymysql.connect() further down.
	# NOTE(review): credentials live in plain text in this file -- keep real
	# values out of version control.
	HOST = '...'
	USER = '...'
	PASSWORD = '...'
	DATABASE = 'dictdb'
	# Query that fetches the training corpus. The REGEXP keeps only entries made
	# of one or more characters other than space, the listed punctuation, digits
	# or hyphen, optionally followed by a single trailing hyphen.
	QUERY_STATEMENT = '''
	SELECT `word`
	FROM `dictdb`.`words`
	WHERE `word` REGEXP '^[^ .,;:!?()1234567890-]+-?$'
	'''
	# Key used to pull the word out of each row dict returned by the cursor.
	WORDFIELD = 'word' # Same as `word` in SELECT clause above

	# ==============================================================================

	import Levenshtein
	import markovify
	import pymysql.cursors
	import re
	import argparse
	import sys

	# Connect to dictionary DB
	try:
	    connection = pymysql.connect(
	        host=HOST,
	        user=USER,
	        password=PASSWORD,
	        db=DATABASE,
	        cursorclass=pymysql.cursors.DictCursor,
	        charset='utf8',
	    )

	# Without a connection nothing below can work, so stop here with a readable
	# message instead of falling through and crashing on the undefined
	# `connection` name.
	except NameError as e:
	    sys.exit('** ERROR: Variable {} is not set'.format(e))

	except pymysql.err.OperationalError as e:
	    sys.exit('** ERROR: {}'.format(e))

	try:
	    with connection.cursor() as cursor:

	        # Do the DB query
	        cursor.execute(QUERY_STATEMENT)

	        # DictCursor returns one dict per row; reduce that to a plain list
	        # of strings by picking the single selected column.
	        lines = [row[WORDFIELD] for row in cursor.fetchall()]

	finally:
	    connection.close()

	# A chain cannot be trained on an empty corpus
	if not lines:
	    sys.exit('** ERROR: Query returned no words to train on')

	# Train on character sequences; a state size of 3 seems to balance
	# complexity and novelty best
	chain = markovify.Chain([list(word.lower()) for word in lines], state_size=3)

	# Guess at number of syllables by counting vowels and diphthongs
	# (WARNING: Ayeri specific)
	_VOWEL = '[aeiouāēīōū]'
	# 'au' has to be tried before the single-vowel alternative: regex alternation
	# is ordered, so with the vowel first the 'a' always wins and 'au' would be
	# counted as two syllables. The alternation bar must be unescaped, otherwise
	# it only matches a literal pipe character.
	_SYLLABLE_NUCLEUS = re.compile(r'(?:au|{0}(?:y(?!{0}))?)'.format(_VOWEL))


	def guess_syl_no(word):
	    """Estimate the number of syllables in *word*.

	    One syllable is counted for each nucleus found: the diphthong 'au', or
	    a single vowel optionally followed by a 'y' that is not itself followed
	    by a vowel (so 'ay' is one nucleus, but the 'y' in 'aya' is not
	    absorbed).

	    Args:
	        word: The (lower-case) word to inspect.

	    Returns:
	        The number of nuclei found; 0 for a string without vowels.
	    """
	    return len(_SYLLABLE_NUCLEUS.findall(word))

	# Function to draw a new word
	discarded = 0

	# Shapes of the two Ayeri-specific word classes checked below; group 1 is
	# the bare stem.
	_NOMINALIZATION = re.compile(r'^(.+?)(?:is)?[aā]n$')
	_VERB = re.compile(r'^(.+?)(?:is)?a?-$')

	# Candidates whose Jaro similarity to any existing word exceeds this value
	# are thrown away. Found by trial and error (sweet spot around 0.87-0.88).
	_MAX_SIMILARITY = 0.88


	def _should_discard(word):
	    """Return True if *word* fails any of the knock-out criteria."""
	    # Too similar to an existing word. any() stops at the first hit instead
	    # of building a full list over the corpus.
	    if any(Levenshtein.jaro(word, known) > _MAX_SIMILARITY for known in lines):
	        return True

	    # AYERI SPECIFIC: ----------------------------------------------------------

	    # Too long for a simple root
	    if guess_syl_no(word) > 2:
	        return True

	    # The word looks like a nominalization: reject it if the corresponding
	    # verb, or a nominalization with the same stem, already exists.
	    nomz = _NOMINALIZATION.search(word)
	    if nomz:
	        # Escape the stem so generated characters are never read as regex
	        # syntax.
	        stem = re.escape(nomz.group(1))
	        verb_of_stem = re.compile(r'{}(?:is)?a?-'.format(stem))
	        noun_of_stem = re.compile(r'{}(is)?[aā]n'.format(stem))
	        if any(verb_of_stem.match(known) or noun_of_stem.match(known)
	               for known in lines):
	            return True

	    # The word looks like a verb: reject it if a nominalization of it
	    # already exists. Testing the match object (rather than the last
	    # character) guarantees group(1) is available.
	    vb = _VERB.search(word)
	    if vb:
	        noun_of_verb = re.compile(
	            r'{}(?:is)?[aā]n'.format(re.escape(vb.group(1))))
	        if any(noun_of_verb.match(known) for known in lines):
	            return True

	    # --------------------------------------------------------------------------

	    return False


	def draw_word():
	    """Generate one new word that passes all knock-out criteria.

	    Candidates are drawn from the Markov chain until one is accepted. Every
	    rejected candidate increments the module-level ``discarded`` counter;
	    empty candidates are skipped without being counted.

	    Returns:
	        The accepted word as a string.
	    """
	    global discarded

	    # A loop rather than recursion, so that a long run of rejections cannot
	    # exhaust the recursion limit.
	    while True:
	        random_word = ''.join(chain.gen())

	        # If nothing is generated
	        if not random_word:
	            continue

	        if _should_discard(random_word):
	            discarded += 1
	            continue

	        return random_word

	def main(argv=None):
	    """Parse command line options and build the output text.

	    Args:
	        argv: List of command line arguments; defaults to ``sys.argv[1:]``.

	    Returns:
	        The generated words, sorted and joined by newlines. With
	        ``-d``/``--show-discarded`` a summary of the number of discarded
	        candidates and the corpus size is appended.
	    """
	    if argv is None:
	        argv = sys.argv[1:]

	    # Parser for command line options
	    parser = argparse.ArgumentParser(description='''Generates new Ayeri words
	        from database input by way of a Markov chain''')
	    parser.add_argument('-n', type=int, default=10,
	        help='''Number of words to generate (default: 10)''')
	    parser.add_argument('-d', '--show-discarded', action='store_true',
	        help='''Show the number of generated words discarded, e.g. duplicates
	        of existing words (default: False)''')
	    args = parser.parse_args(argv)

	    # Generate n unique words; a set drops repeated draws by itself
	    words = set()
	    while len(words) < args.n:
	        words.add(draw_word())

	    listing = '\n'.join(sorted(words))
	    if not args.show_discarded:
	        return listing

	    # The '**' markers match the prefix used by the error messages
	    return '{}\n\n** Discarded: {}\n** Sample size: {}\n'.format(
	        listing,
	        discarded,
	        len(lines)
	    )

	# Script entry point: emit the generated text, then exit normally
	if __name__ == '__main__':
	    sys.stdout.write(main())
	    sys.exit()