Created
September 2, 2015 04:47
-
-
Save EvanTheB/1fc685988cd791d3405f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
import string | |
# refs: | |
# http://eflorenzano.com/blog/2008/11/17/writing-markov-chain-irc-bot-twisted-and-python/ | |
# http://www.ccel.org/ccel/bible/kjv.txt | |
# Sentinel stored in a state's follower list to mark "the message ended here".
END = "END"
# Source of the original syllable-counting heuristic:
# https://github.com/akkana/scripts/blob/master/countsyl
# Debug flag for verbose tracing; disabled by default.
verbose = False
def count_syllables(word):
    """Return a rough syllable estimate for ``word`` as a 2-tuple.

    Callers in this module only use element ``[0]``.

    * A string made only of digits counts one syllable per digit and is
      returned as ``(len(word), len(word))``.
    * Any other word is lower-cased, trailing ``e`` characters are stripped
      (a crude "silent e" rule), and the estimate is the number of
      *distinct* vowel letters from ``aeiouy`` that remain.  The second
      tuple element is always ``0`` in this case.

    This is a heuristic, not a real syllable counter: repeated vowels are
    counted once ("banana" -> 1) and words such as "the" yield 0.
    """
    if word.isdigit():
        return len(word), len(word)
    # Collapsing doubled letters cannot change which vowels are present, so
    # stripping the trailing 'e's is all the preprocessing that is needed.
    stem = word.lower().rstrip('e')
    return sum(vowel in stem for vowel in 'aeiouy'), 0
class Markov(object):
    """Word-level Markov chain built from lists of words ("messages").

    ``self.markov`` maps a state -- a tuple holding up to ``chain_len`` of
    the most recent words -- to the list of words seen right after that
    state.  Duplicates are kept, so ``random.choice`` picks followers in
    proportion to how often they were seen.  The module-level ``END``
    sentinel is recorded as the follower of a message's final state.
    """

    def __init__(self, corpus, chain_len):
        """Build a model.

        corpus: iterable of messages, each a list of words.
        chain_len: maximum number of preceding words used as a state.
        """
        self.corpus = []        # every fed word, in feeding order
        self.flat_corpus = ""   # searchable text used by is_in_corpus()
        self.markov = {}        # state tuple -> list of follower words
        self.chain_len = chain_len
        for sentence in corpus:
            self.feed(sentence)

    def _advance(self, state, word):
        """Append ``word`` to ``state`` in place, keeping at most chain_len words."""
        state.append(word)
        if len(state) > self.chain_len:
            del state[0]

    def feed(self, msg):
        """Learn one message (a list of words)."""
        self.corpus += msg
        # Words are space-delimited and messages newline-terminated so that
        # is_in_corpus() only matches whole words within a single message.
        self.flat_corpus += " " + " ".join(msg) + " \n"
        state = []
        for word in msg:
            self.markov.setdefault(tuple(state), []).append(word)
            self._advance(state, word)
        self.markov.setdefault(tuple(state), []).append(END)

    def generate(self, seed=None, max_words=100):
        """Return a randomly generated list of words.

        seed: optional list of words to start from.  Leading seed words are
            dropped until the remainder is a known state; if such a
            remainder exists the result starts with the full seed,
            otherwise the seed is ignored.  The caller's list is not
            modified.
        max_words: maximum number of words generated after the seed.
        """
        # Work on a copy: the head-trimming below must not leak to callers
        # that retry with the same seed.
        seed = list(seed) if seed else []
        out = []
        while seed and tuple(seed) not in self.markov:
            out.append(seed.pop(0))
        if seed:
            out.extend(seed)
        else:
            out = []
        state = list(seed)
        for _ in range(max_words):
            followers = self.markov.get(tuple(state))
            if not followers:
                # Only possible when nothing has been fed yet.
                break
            next_word = random.choice(followers)
            if next_word == END:
                break
            self._advance(state, next_word)
            out.append(next_word)
        return out

    def generate_original(self, seed=None, max_words=100, min_words=0):
        """Generate text that is not a verbatim quote from the corpus.

        Tries up to 1000 times; returns the first result that is not found
        by is_in_corpus() and has more than ``min_words`` words, or ``[]``
        if every attempt is rejected.  Rejected attempts are printed.
        """
        for _ in range(1000):
            s = self.generate(seed=seed, max_words=max_words)
            if not self.is_in_corpus(s) and len(s) > min_words:
                return s
            print("not original: %s" % (s,))
        return []

    def generate_syllables(self, syllables, seed=None):
        """Generate non-quoted text whose syllable estimates sum to ``syllables``.

        Tries up to 1000 times and returns ``[]`` when no attempt matches.
        """
        for _ in range(1000):
            s = self.generate(seed=seed)
            if self.is_in_corpus(s):
                continue
            if sum(count_syllables(w)[0] for w in s) == syllables:
                return s
        return []

    def is_in_corpus(self, words):
        """Return True if ``words`` occurs, in order, inside one fed message.

        An empty sequence is considered contained.
        """
        words = list(words)
        if not words:
            return True
        needle = " " + " ".join(words) + " "
        return needle in self.flat_corpus

    def get_prob(self, words):
        """Return the probability of the chain emitting ``words`` from the start.

        The result is the product, over each word, of the fraction of
        followers of the current state that equal that word.  Returns 0.0
        as soon as a state has never been seen.  The end-of-message
        probability is not included.
        """
        prob = 1.0
        state = []
        for word in words:
            followers = self.markov.get(tuple(state))
            if not followers:
                return 0.0
            # list.count compares by equality, not identity.
            prob *= followers.count(word) / float(len(followers))
            self._advance(state, word)
        return prob
# Characters replaced by a space during normalisation.  The sentence
# terminators . ? ! and the apostrophe are not in this set, so they survive
# (get_sentence splits on the terminators).
punct = "\"#$%&()*+,-/:;<=>@[\\]^_`{|}~"
# string.maketrans only exists on Python 2; Python 3 provides str.maketrans.
_maketrans = getattr(string, "maketrans", None) or str.maketrans
trans_table = _maketrans(punct, " " * len(punct))


def ennormalisation(strang):
    """Return ``strang`` lower-cased with every ``punct`` character turned into a space."""
    return strang.lower().translate(trans_table)
def get_sentence(filename):
    """Yield the sentences of a text file, each as a list of words.

    Every line is passed through ennormalisation() and split on
    whitespace.  A sentence ends at a word that ends with ``.``, ``?`` or
    ``!``; that word is kept as the sentence's last word.  Words left over
    after the final terminator are yielded as one last sentence instead of
    being discarded.

    The whole file is read (and closed) before the first sentence is
    yielded.
    """
    words = []
    with open(filename) as corp:
        for line in corp:
            words.extend(ennormalisation(line).split())

    terminators = (".", "?", "!")
    sentence = []
    for word in words:
        sentence.append(word)
        if word.endswith(terminators):
            yield sentence
            sentence = []
    if sentence:
        yield sentence
def get_sentence_lines(filename):
    """Yield each non-blank line of a text file as a list of words.

    Every line is passed through ennormalisation() and split on
    whitespace.  Lines that contain no words after normalisation are
    skipped, so callers never receive an empty list.  Lines are yielded
    last-to-first, and the file is read (and closed) before the first
    yield.
    """
    sentences = []
    with open(filename) as corp:
        for line in corp:
            words = ennormalisation(line).split()
            # A line read from a file still carries its newline, so
            # emptiness has to be checked after splitting.
            if words:
                sentences.append(words)
    for words in reversed(sentences):
        yield words
import sys | |
import os | |
from willie.module import commands, rule | |
def _split_haiku(words):
    """Cut ``words`` on word boundaries into 5/7/5-syllable lines.

    Returns a list of three word lists, or None when no such cut exists.
    Syllables are estimated with count_syllables(word)[0].
    """
    lines = []
    rest = list(words)
    for target in (5, 7):
        total = 0
        # Take the shortest prefix whose estimate hits the target exactly.
        for cut, word in enumerate(rest, 1):
            total += count_syllables(word)[0]
            if total == target:
                break
        else:
            return None
        lines.append(rest[:cut])
        rest = rest[cut:]
    # The last line is everything that is left, so check it as a whole.
    if sum(count_syllables(w)[0] for w in rest) != 5:
        return None
    lines.append(rest)
    return lines


# NOTE(review): the docstring below is left as-is because the bot framework
# may surface it as the command's help text -- confirm before rewording.
#
# Says a three-line 5/7/5 "haiku" built from a 17-syllable Markov
# generation, optionally seeded with the words that follow the command.
# Up to 50 generations are tried before giving up with "no soup for you".
@commands('haiku')
def markov_haiku(bot, trigger):
    """
    haiku
    """
    try:
        # The first token is dropped; presumably it is the command word.
        seed = ennormalisation(str(trigger.group())).split()[1:]
        model = bot.memory['markov']
        for _ in range(50):
            # Pass a copy so no attempt can alter the seed for the next one.
            words = model.generate_syllables(17, seed=list(seed))
            lines = _split_haiku(words)
            if lines is None:
                continue
            for line in lines:
                bot.say(" ".join(line))
            break
        else:
            bot.say("no soup for you")
    except UnicodeEncodeError:
        # Best effort: text that cannot be converted with str() is ignored.
        pass
# NOTE(review): the docstring below is left as-is because the bot framework
# may surface it as the command's help text -- confirm before rewording.
#
# Says one Markov-generated sentence of at most 40 generated words that is
# not a verbatim corpus quote, optionally seeded with the words following
# the command; says "I shan't!" when nothing could be generated.
@commands('ebooks')
def markov_excrete(bot, trigger):
    """
    Ebooks
    """
    try:
        # The first token is dropped; presumably it is the command word.
        seed = ennormalisation(str(trigger.group())).split()[1:]
        print("seed: %s" % (seed,))
        words = bot.memory['markov'].generate_original(seed=seed, max_words=40)
        text = " ".join(words)
        if text:
            bot.say(text)
        else:
            bot.say("I shan't!")
    except UnicodeEncodeError:
        # Best effort: text that cannot be converted with str() is ignored.
        pass
# Learns from every matched line.  The pattern's negative lookahead rejects
# text that starts with a dot (presumably bot commands).  The raw text is
# appended to filtered.log, then the normalised words are fed to the model.
@rule(r'(?!\.).*')
def markov_feed(bot, trigger):
    try:
        text = str(trigger.group())
        with open("filtered.log", 'a') as f:
            f.write(text + "\n")
        feed = ennormalisation(text).split()
        print("markov feed: %s" % (feed,))
        # A message with no words would only teach the model that messages
        # can end before they start, so it is not fed.
        if feed:
            bot.memory['markov'].feed(feed)
    except UnicodeEncodeError:
        # Best effort: text that cannot be converted or written is ignored.
        pass
def setup(bot):
    """Build the chain-length-2 Markov model from filtered.log and store it
    in ``bot.memory['markov']``.

    If the log file does not exist yet, an empty model is stored instead;
    any other I/O error is re-raised.
    """
    import errno

    try:
        model = Markov(get_sentence_lines("filtered.log"), 2)
    except IOError as err:
        if err.errno != errno.ENOENT:
            raise
        # No log yet (markov_feed creates it on the first message).
        model = Markov([], 2)
    bot.memory['markov'] = model
def shutdown(bot):
    """Remove the Markov model from ``bot.memory``; a no-op if none is stored."""
    try:
        del bot.memory['markov']
    except KeyError:
        # Nothing was stored (e.g. setup did not complete); nothing to undo.
        pass
# Manual scratch area for running the module directly: it builds a
# chain-length-3 model from filtered.log.  The loop body currently does
# nothing; the commented-out lines are experiments kept for reference.
if __name__ == '__main__':
    # print [count_syllables(w) for w in ['sick','he','is','so','amazing']]
    # Alternative corpora, split into sentences on . ? ! :
    # my_little_markov = Markov(get_sentence("bible.txt"), 3)
    # my_little_markov = Markov(get_sentence("jimstone.txt"), 3)
    # my_little_markov = Markov(get_sentence("timecube.txt"), 3)
    my_little_markov = Markov(get_sentence_lines("filtered.log"), 3)
    for i in range(50):
        # words = my_little_markov.generate_original(seed=[str(u'human')])
        # print " ".join(words)
        # print my_little_markov.get_prob(words)
        # print my_little_markov.is_in_corpus(words)
        # print " ".join(my_little_markov.generate_syllables(5))
        # print " ".join(my_little_markov.generate_syllables(7))
        # print " ".join(my_little_markov.generate_syllables(5))
        pass
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment