import random
import string
# refs:
# http://eflorenzano.com/blog/2008/11/17/writing-markov-chain-irc-bot-twisted-and-python/
# http://www.ccel.org/ccel/bible/kjv.txt
END = "END"
# https://github.com/akkana/scripts/blob/master/countsyl
verbose = False
def count_syllables(word):
    if word.isdigit():
        # digit strings: count one "syllable" per character
        return len(word), len(word)
    # wrowl alt: count distinct vowel letters after collapsing repeated
    # characters and stripping a trailing 'e'.
    # NOTE: this early return short-circuits the min/max counter below,
    # so the rest of the function is effectively dead code.
    return sum([x in ''.join([x for idx, x in enumerate(word)
                              if idx == 0 or word[idx - 1] != word[idx]]).rstrip('e')
                for x in list('aeiouy')]), 0

    vowels = ['a', 'e', 'i', 'o', 'u']
    on_vowel = False
    in_diphthong = False
    minsyl = 0
    maxsyl = 0
    lastchar = None
    word = word.lower()
    for c in word:
        is_vowel = c in vowels

        if on_vowel is None:
            on_vowel = is_vowel

        # y is a special case
        if c == 'y':
            is_vowel = not on_vowel

        if is_vowel:
            if verbose:
                print c, "is a vowel"
            if not on_vowel:
                # We weren't on a vowel before.
                # Seeing a new vowel bumps the syllable count.
                if verbose:
                    print "new syllable"
                minsyl += 1
                maxsyl += 1
            elif on_vowel and not in_diphthong and c != lastchar:
                # We were already in a vowel.
                # Don't increment anything except the max count,
                # and only do that once per diphthong.
                if verbose:
                    print c, "is a diphthong"
                in_diphthong = True
                maxsyl += 1
        elif verbose:
            print "[consonant]"

        on_vowel = is_vowel
        lastchar = c

    # Some special cases:
    if word[-1] == 'e':
        minsyl -= 1
    # if it ended with a consonant followed by y, count that as a syllable.
    if word[-1] == 'y' and not on_vowel:
        maxsyl += 1
    return minsyl, maxsyl
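# Sanity checks for the short-circuit counter above; these are the values this
# implementation actually returns, not dictionary syllable counts:
#   count_syllables("42")     -> (2, 2)  one count per digit
#   count_syllables("hello")  -> (2, 0)  distinct vowel letters e, o
#   count_syllables("banana") -> (1, 0)  only the vowel letter a appears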
class Markov(object):
    """Word-level Markov chain keyed on tuples of up to chain_len preceding words."""

    def __init__(self, corpus, chain_len):
        self.corpus = []
        self.flat_corpus = ""
        self.markov = {}
        self.chain_len = chain_len
        for s in corpus:
            self.feed(s)
    def feed(self, msg):
        def add(key, val):
            self.markov.setdefault(key, []).append(val)
        self.corpus += msg
        self.flat_corpus += "".join(msg)
        buf = []
        for word in msg:
            add(tuple(buf), word)
            buf.append(word)
            if len(buf) > self.chain_len:
                del buf[0]
        add(tuple(buf), END)
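    # For example (assuming an empty chain with chain_len=2), feed(["a", "b", "c"])
    # leaves the table as:
    #   {(): ["a"], ("a",): ["b"], ("a", "b"): ["c"], ("b", "c"): [END]}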
    def generate(self, seed=None, max_words=100):
        seed = [] if not seed else seed
        out = []
        # look for the seed in the dict by dropping head words one at a time
        while seed:
            if tuple(seed) in self.markov:
                break
            out.append(seed.pop(0))
        if len(seed):
            out += list(seed)
        else:
            # no suffix of the seed is a known key; start from scratch
            out = []
        buf = list(seed)
        for i in xrange(max_words):
            next_word = random.choice(self.markov[tuple(buf)])
            # print "B", buf
            # print "C", len(markov[tuple(buf)])
            # print "M", markov[tuple(buf)]
            # print
            if next_word == END:
                break
            buf.append(next_word)
            if len(buf) > self.chain_len:
                del buf[0]
            out.append(next_word)
        return out
    def generate_original(self, seed=None, max_words=100, min_words=0):
        for i in range(1000):
            s = self.generate(seed=seed, max_words=max_words)
            if not self.is_in_corpus(s) and len(s) > min_words:
                return s
            print "not original:", s
        return []

    def generate_syllables(self, syllables, seed=None):
        for i in range(1000):
            s = self.generate(seed=seed)
            if not self.is_in_corpus(s) and \
                    sum(count_syllables(w)[0] for w in s) == syllables:
                # print s, [count_syllables(w)[0] for w in s]
                return s
        return []
    def is_in_corpus(self, words):
        # print words
        sub = "".join(words)
        return sub in self.flat_corpus

    def get_prob(self, words):
        # probability of `words` under the chain: the product of the per-step
        # transition probabilities (matches / total at each key).
        words = list(words)
        words.reverse()
        buf = []
        den = num = 1.
        while len(words):
            next_word = words.pop()
            den *= len([True for w in self.markov[tuple(buf)]
                        if w == next_word])  # equality, not identity
            num *= len(self.markov[tuple(buf)])
            buf.append(next_word)
            if len(buf) > self.chain_len:
                del buf[0]
        return den / num
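# Usage sketch of the chain itself (hypothetical two-sentence corpus, chain_len=2):
#   m = Markov([["the", "cat", "sat."], ["the", "cat", "ran."]], 2)
#   m.generate(seed=["the", "cat"])  # -> e.g. ['the', 'cat', 'sat.']
#   m.is_in_corpus(["the", "cat"])   # -> True ("thecat" occurs in the flat corpus)
#   m.get_prob(["the", "cat"])       # -> 1.0 (every recorded chain starts "the cat")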
punct = "\"#$%&()*+,-/:;<=>@[\\]^_`{|}~"
trans_table = string.maketrans(punct, " " * len(punct))
def ennormalisation(strang):
    return strang.lower().translate(trans_table)
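# e.g. ennormalisation("Hello, World!") -> "hello  world!"
# (lower-cases and replaces the punctuation above with spaces; . ? ! and ' survive)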
def get_sentence(filename):
    """
    given filename, yield sanitised sentences as word lists
    Split on .?!
    """
    words = []
    with open(filename) as corp:
        for l in corp:
            l = ennormalisation(l)
            words += l.strip().split()
    words.reverse()
    buf = []
    while len(words):
        buf.append(words.pop())
        if buf[-1].endswith(".") \
                or buf[-1].endswith("?") \
                or buf[-1].endswith("!"):
            yield buf
            buf = []
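# e.g. a file containing "Hello there. How are you?" yields
#   ['hello', 'there.'] and then ['how', 'are', 'you?']
# (words after the last terminator are dropped)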
def get_sentence_lines(filename):
    """
    given filename, yield sanitised sentences as word lists
    Split on newline
    """
    words = []
    with open(filename) as corp:
        for l in corp:
            if not l.strip():  # lines still carry their newline, so test the content
                continue
            l = ennormalisation(l)
            words.append(l.strip().split())
    while len(words):
        buf = words.pop()
        yield buf
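# e.g. a two-line file "hello there!\nbye now" yields ['bye', 'now'] and then
#   ['hello', 'there!'] (lines come back in reverse file order)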
import sys
import os
from willie.module import commands, rule
@commands('haiku')
def markov_haiku(bot, trigger):
    """
    Generate a 5/7/5 haiku from the markov chain, optionally seeded with the
    words after the command.
    """
    try:
        # bot.say(" ".join(bot.memory['markov'].generate_syllables(5)))
        # bot.say(" ".join(bot.memory['markov'].generate_syllables(7)))
        # bot.say(" ".join(bot.memory['markov'].generate_syllables(5)))
        seed = ennormalisation(str(trigger.group()))
        seed = seed.split()[1:]
        for i in range(50):
            # generate 17 syllables, then try to split them 5 / 7 / 5
            ret = bot.memory['markov'].generate_syllables(17, seed=seed)
            msg = []
            for i in range(len(ret)):
                if sum(count_syllables(w)[0] for w in ret[:i]) == 5:
                    msg.append(ret[:i])
                    # print sum(count_syllables(w)[0] for w in msg[0])
                    ret = ret[i:]
                    break
            else:
                continue
            for i in range(len(ret)):
                if sum(count_syllables(w)[0] for w in ret[:i]) == 7:
                    msg.append(ret[:i])
                    ret = ret[i:]
                    break
            else:
                continue
            # the last line may need the whole remainder, so allow i == len(ret)
            for i in range(len(ret) + 1):
                if sum(count_syllables(w)[0] for w in ret[:i]) == 5:
                    msg.append(ret[:i])
                    ret = ret[i:]
                    break
            else:
                continue
            bot.say(" ".join(msg[0]))
            bot.say(" ".join(msg[1]))
            bot.say(" ".join(msg[2]))
            print "haiku spill:", ret
            break
        else:
            bot.say("no soup for you")
    except UnicodeEncodeError as e:
        pass
@commands('ebooks')
def markov_excrete(bot, trigger):
    """
    Ebooks: say a generated sentence that does not appear verbatim in the corpus.
    """
    try:
        seed = ennormalisation(str(trigger.group()))
        seed = seed.split()[1:]
        print "seed:", seed
        ret = bot.memory['markov'].generate_original(seed=seed, max_words=40)
        if len(" ".join(ret)):
            bot.say(" ".join(ret))
        else:
            bot.say("I shan't!")
    except UnicodeEncodeError as e:
        pass
@rule(r'(?!\.).*')
def markov_feed(bot, trigger):
    # log every non-command line and feed it to the chain
    try:
        with open("filtered.log", 'a') as f:
            f.write(str(trigger.group()) + "\n")
        feed = ennormalisation(str(trigger.group()))
        feed = feed.split()
        print "markov feed:", feed
        bot.memory['markov'].feed(feed)
    except UnicodeEncodeError as e:
        pass
def setup(bot):
    bot.memory['markov'] = Markov(
        get_sentence_lines("filtered.log"),
        2)


def shutdown(bot):
    del bot.memory['markov']
if __name__ == '__main__':
    # print [count_syllables(w) for w in ['sick','he','is','so','amazing']]
    # my_little_markov = Markov(get_sentence("bible.txt"), 3)
    # my_little_markov = Markov(get_sentence("jimstone.txt"), 3)
    # my_little_markov = Markov(get_sentence("timecube.txt"), 3)
    my_little_markov = Markov(get_sentence_lines("filtered.log"), 3)
    for i in range(50):
        # words = my_little_markov.generate_original(seed=[str(u'human')])
        # print " ".join(words)
        # print my_little_markov.get_prob(words)
        # print my_little_markov.is_in_corpus(words)
        # print " ".join(my_little_markov.generate_syllables(5))
        # print " ".join(my_little_markov.generate_syllables(7))
        # print " ".join(my_little_markov.generate_syllables(5))
        # print
        pass