import random
import string
# refs:
# http://eflorenzano.com/blog/2008/11/17/writing-markov-chain-irc-bot-twisted-and-python/
# http://www.ccel.org/ccel/bible/kjv.txt
END = "END"
# https://github.com/akkana/scripts/blob/master/countsyl
verbose = False
def count_syllables(word):
    if word.isdigit():
        # digit strings: count one "syllable" per character
        return len(word), len(word)
    # wrowl alt: count distinct vowel letters after collapsing repeated
    # characters and stripping a trailing 'e'.
    # NOTE: this early return short-circuits the min/max counter below,
    # so the rest of the function is effectively dead code.
    return sum([x in ''.join([x for idx, x in enumerate(word)
                              if idx == 0 or word[idx - 1] != word[idx]]).rstrip('e')
                for x in list('aeiouy')]), 0

    vowels = ['a', 'e', 'i', 'o', 'u']
    on_vowel = False
    in_diphthong = False
    minsyl = 0
    maxsyl = 0
    lastchar = None
    word = word.lower()
    for c in word:
        is_vowel = c in vowels

        if on_vowel is None:
            on_vowel = is_vowel

        # y is a special case
        if c == 'y':
            is_vowel = not on_vowel

        if is_vowel:
            if verbose:
                print c, "is a vowel"
            if not on_vowel:
                # We weren't on a vowel before.
                # Seeing a new vowel bumps the syllable count.
                if verbose:
                    print "new syllable"
                minsyl += 1
                maxsyl += 1
            elif on_vowel and not in_diphthong and c != lastchar:
                # We were already in a vowel.
                # Don't increment anything except the max count,
                # and only do that once per diphthong.
                if verbose:
                    print c, "is a diphthong"
                in_diphthong = True
                maxsyl += 1
        elif verbose:
            print "[consonant]"

        on_vowel = is_vowel
        lastchar = c

    # Some special cases:
    if word[-1] == 'e':
        minsyl -= 1
    # if it ended with a consonant followed by y, count that as a syllable.
    if word[-1] == 'y' and not on_vowel:
        maxsyl += 1
    return minsyl, maxsyl
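# Sanity checks for the short-circuit counter above; these are the values this
# implementation actually returns, not dictionary syllable counts:
#   count_syllables("42")     -> (2, 2)  one count per digit
#   count_syllables("hello")  -> (2, 0)  distinct vowel letters e, o
#   count_syllables("banana") -> (1, 0)  only the vowel letter a appears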
class Markov(object):
    """Word-level Markov chain keyed on tuples of up to chain_len preceding words."""

    def __init__(self, corpus, chain_len):
        self.corpus = []
        self.flat_corpus = ""
        self.markov = {}
        self.chain_len = chain_len
        for s in corpus:
            self.feed(s)
    def feed(self, msg):
        def add(key, val):
            self.markov.setdefault(key, []).append(val)
        self.corpus += msg
        self.flat_corpus += "".join(msg)
        buf = []
        for word in msg:
            add(tuple(buf), word)
            buf.append(word)
            if len(buf) > self.chain_len:
                del buf[0]
        add(tuple(buf), END)
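    # For example (assuming an empty chain with chain_len=2), feed(["a", "b", "c"])
    # leaves the table as:
    #   {(): ["a"], ("a",): ["b"], ("a", "b"): ["c"], ("b", "c"): [END]}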
    def generate(self, seed=None, max_words=100):
        seed = [] if not seed else seed
        out = []
        # look for the seed in the dict by dropping head words one at a time
        while seed:
            if tuple(seed) in self.markov:
                break
            out.append(seed.pop(0))
        if len(seed):
            out += list(seed)
        else:
            # no suffix of the seed is a known key; start from scratch
            out = []
        buf = list(seed)
        for i in xrange(max_words):
            next_word = random.choice(self.markov[tuple(buf)])
            # print "B", buf
            # print "C", len(markov[tuple(buf)])
            # print "M", markov[tuple(buf)]
            # print
            if next_word == END:
                break
            buf.append(next_word)
            if len(buf) > self.chain_len:
                del buf[0]
            out.append(next_word)
        return out
    def generate_original(self, seed=None, max_words=100, min_words=0):
        for i in range(1000):
            s = self.generate(seed=seed, max_words=max_words)
            if not self.is_in_corpus(s) and len(s) > min_words:
                return s
            print "not original:", s
        return []

    def generate_syllables(self, syllables, seed=None):
        for i in range(1000):
            s = self.generate(seed=seed)
            if not self.is_in_corpus(s) and \
                    sum(count_syllables(w)[0] for w in s) == syllables:
                # print s, [count_syllables(w)[0] for w in s]
                return s
        return []
    def is_in_corpus(self, words):
        # print words
        sub = "".join(words)
        return sub in self.flat_corpus

    def get_prob(self, words):
        # probability of `words` under the chain: the product of the per-step
        # transition probabilities (matches / total at each key).
        words = list(words)
        words.reverse()
        buf = []
        den = num = 1.
        while len(words):
            next_word = words.pop()
            den *= len([True for w in self.markov[tuple(buf)]
                        if w == next_word])  # equality, not identity
            num *= len(self.markov[tuple(buf)])
            buf.append(next_word)
            if len(buf) > self.chain_len:
                del buf[0]
        return den / num
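# Usage sketch of the chain itself (hypothetical two-sentence corpus, chain_len=2):
#   m = Markov([["the", "cat", "sat."], ["the", "cat", "ran."]], 2)
#   m.generate(seed=["the", "cat"])  # -> e.g. ['the', 'cat', 'sat.']
#   m.is_in_corpus(["the", "cat"])   # -> True ("thecat" occurs in the flat corpus)
#   m.get_prob(["the", "cat"])       # -> 1.0 (every recorded chain starts "the cat")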
punct = "\"#$%&()*+,-/:;<=>@[\\]^_`{|}~"
trans_table = string.maketrans(punct, " " * len(punct))
def ennormalisation(strang):
    return strang.lower().translate(trans_table)
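# e.g. ennormalisation("Hello, World!") -> "hello  world!"
# (lower-cases and replaces the punctuation above with spaces; . ? ! and ' survive)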
def get_sentence(filename):
    """
    given filename, yield sanitised sentences as word lists
    Split on .?!
    """
    words = []
    with open(filename) as corp:
        for l in corp:
            l = ennormalisation(l)
            words += l.strip().split()
    words.reverse()
    buf = []
    while len(words):
        buf.append(words.pop())
        if buf[-1].endswith(".") \
                or buf[-1].endswith("?") \
                or buf[-1].endswith("!"):
            yield buf
            buf = []
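# e.g. a file containing "Hello there. How are you?" yields
#   ['hello', 'there.'] and then ['how', 'are', 'you?']
# (words after the last terminator are dropped)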
def get_sentence_lines(filename):
    """
    given filename, yield sanitised sentences as word lists
    Split on newline
    """
    words = []
    with open(filename) as corp:
        for l in corp:
            if not l.strip():  # lines still carry their newline, so test the content
                continue
            l = ennormalisation(l)
            words.append(l.strip().split())
    while len(words):
        buf = words.pop()
        yield buf
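# e.g. a two-line file "hello there!\nbye now" yields ['bye', 'now'] and then
#   ['hello', 'there!'] (lines come back in reverse file order)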
import sys
import os
from willie.module import commands, rule
@commands('haiku')
def markov_haiku(bot, trigger):
    """
    Generate a 5/7/5 haiku from the markov chain, optionally seeded with the
    words after the command.
    """
    try:
        # bot.say(" ".join(bot.memory['markov'].generate_syllables(5)))
        # bot.say(" ".join(bot.memory['markov'].generate_syllables(7)))
        # bot.say(" ".join(bot.memory['markov'].generate_syllables(5)))
        seed = ennormalisation(str(trigger.group()))
        seed = seed.split()[1:]
        for i in range(50):
            # generate 17 syllables, then try to split them 5 / 7 / 5
            ret = bot.memory['markov'].generate_syllables(17, seed=seed)
            msg = []
            for i in range(len(ret)):
                if sum(count_syllables(w)[0] for w in ret[:i]) == 5:
                    msg.append(ret[:i])
                    # print sum(count_syllables(w)[0] for w in msg[0])
                    ret = ret[i:]
                    break
            else:
                continue
            for i in range(len(ret)):
                if sum(count_syllables(w)[0] for w in ret[:i]) == 7:
                    msg.append(ret[:i])
                    ret = ret[i:]
                    break
            else:
                continue
            # the last line may need the whole remainder, so allow i == len(ret)
            for i in range(len(ret) + 1):
                if sum(count_syllables(w)[0] for w in ret[:i]) == 5:
                    msg.append(ret[:i])
                    ret = ret[i:]
                    break
            else:
                continue
            bot.say(" ".join(msg[0]))
            bot.say(" ".join(msg[1]))
            bot.say(" ".join(msg[2]))
            print "haiku spill:", ret
            break
        else:
            bot.say("no soup for you")
    except UnicodeEncodeError as e:
        pass
@commands('ebooks')
def markov_excrete(bot, trigger):
    """
    Ebooks: say a generated sentence that does not appear verbatim in the corpus.
    """
    try:
        seed = ennormalisation(str(trigger.group()))
        seed = seed.split()[1:]
        print "seed:", seed
        ret = bot.memory['markov'].generate_original(seed=seed, max_words=40)
        if len(" ".join(ret)):
            bot.say(" ".join(ret))
        else:
            bot.say("I shan't!")
    except UnicodeEncodeError as e:
        pass
@rule(r'(?!\.).*')
def markov_feed(bot, trigger):
    # log every non-command line and feed it to the chain
    try:
        with open("filtered.log", 'a') as f:
            f.write(str(trigger.group()) + "\n")
        feed = ennormalisation(str(trigger.group()))
        feed = feed.split()
        print "markov feed:", feed
        bot.memory['markov'].feed(feed)
    except UnicodeEncodeError as e:
        pass
def setup(bot):
    bot.memory['markov'] = Markov(
        get_sentence_lines("filtered.log"),
        2)


def shutdown(bot):
    del bot.memory['markov']
if __name__ == '__main__':
    # print [count_syllables(w) for w in ['sick','he','is','so','amazing']]
    # my_little_markov = Markov(get_sentence("bible.txt"), 3)
    # my_little_markov = Markov(get_sentence("jimstone.txt"), 3)
    # my_little_markov = Markov(get_sentence("timecube.txt"), 3)
    my_little_markov = Markov(get_sentence_lines("filtered.log"), 3)
    for i in range(50):
        # words = my_little_markov.generate_original(seed=[str(u'human')])
        # print " ".join(words)
        # print my_little_markov.get_prob(words)
        # print my_little_markov.is_in_corpus(words)
        # print " ".join(my_little_markov.generate_syllables(5))
        # print " ".join(my_little_markov.generate_syllables(7))
        # print " ".join(my_little_markov.generate_syllables(5))
        # print
        pass