Skip to content

Instantly share code, notes, and snippets.

@Cairnarvon
Created July 1, 2012 02:23
Show Gist options
  • Save Cairnarvon/3026525 to your computer and use it in GitHub Desktop.
Save Cairnarvon/3026525 to your computer and use it in GitHub Desktop.
Tired of answering Formspring questions? Let Andrey Markov do it for you.
#!/usr/bin/python
import collections
import random
import sqlite3
import sys
try:
from nltk.tokenize import sent_tokenize
except ImportError:
import re
def sent_tokenize(paragraph):
return re.split('[.!?]', paragraph)
class Markov(object):
    """Second-order Markov chain text generator.

    The model maps every pair of consecutive tokens in the corpus to the
    list of tokens observed directly after that pair.  A newline string
    serves both as the initial two-token prefix and as the marker that
    is recorded after the final token of the corpus.
    """

    # Sentinel for the start prefix and the end-of-corpus marker.
    _BOUNDARY = "\n"

    def __init__(self, corpus, splitter=None):
        """Build the transition table from *corpus*.

        corpus   -- the training text.
        splitter -- optional callable taking the corpus and returning an
                    iterable of tokens.  When omitted the corpus is split
                    on whitespace with its own ``split()`` method, which
                    works for both unicode and byte strings.
        """
        if splitter is None:
            words = corpus.split()
        else:
            words = splitter(corpus)

        self.state = collections.defaultdict(
            lambda: collections.defaultdict(list))
        w1 = w2 = self._BOUNDARY
        for word in words:
            self.state[w1][w2].append(word)
            w1, w2 = w2, word
        # Record that the corpus ends after the last two tokens.
        self.state[w1][w2].append(self._BOUNDARY)

    def __call__(self, paras=(1,)):
        """Generate text and return it as a single string.

        paras -- iterable of integers; each entry produces one paragraph
                 containing that many sentences.  Entries that are zero
                 or negative produce an empty paragraph.

        A sentence ends at a token whose last character is ``.``, ``!``
        or ``?``.  Tokens within a paragraph are joined with single
        spaces and paragraphs are joined with a blank line.

        When the walk reaches the end-of-corpus marker, the marker is
        not emitted: any unfinished sentence is counted as complete and
        the walk restarts from the initial prefix.  Without this the
        next lookup would find no followers and ``random.choice`` would
        raise IndexError.  A model built from an empty corpus yields
        empty paragraphs instead of looping forever.
        """
        boundary = self._BOUNDARY
        ret = []
        # The prefix is deliberately carried over from one paragraph to
        # the next rather than being reset per paragraph.
        w1 = w2 = boundary
        for sens in paras:
            para = []
            unfinished = False  # tokens emitted since the last sentence end?
            while sens > 0:
                t = random.choice(self.state[w1][w2])
                if t == boundary:
                    if w1 == boundary and w2 == boundary:
                        # The start prefix leads straight to the end
                        # marker: there is nothing to generate.
                        break
                    if unfinished:
                        sens -= 1
                        unfinished = False
                    w1 = w2 = boundary
                    continue
                para.append(t)
                # endswith() also copes with an empty token, which a
                # custom splitter may produce.
                if t.endswith(('.', '!', '?')):
                    sens -= 1
                    unfinished = False
                else:
                    unfinished = True
                w1, w2 = w2, t
            ret.append(" ".join(para))
        return "\n\n".join(ret)
def analyze(db):
    """Read the answers from a database and derive generation inputs.

    db -- path of an SQLite database containing a ``questions`` table
          with an ``answer`` column.

    Returns a ``(corpus, spread)`` tuple.  ``corpus`` is all non-NULL
    answers, shuffled and joined with newlines.  ``spread`` is a list of
    sentence counts, one per paragraph to generate: the number of
    paragraphs is drawn at random from the observed paragraphs-per-answer
    counts, and each entry is drawn at random from the observed
    sentences-per-paragraph counts.

    When there are no non-NULL answers, ``("", [])`` is returned.
    """
    conn = sqlite3.connect(db)
    try:
        cur = conn.cursor()
        cur.execute('select answer from questions;')
        rows = cur.fetchall()
    finally:
        # Close the connection even if the query fails.
        conn.close()

    paras, sens, corpus = [], [], []
    for answer, in rows:
        if answer is None:
            # Apparently this does happen
            continue
        corpus.append(answer)
        # Paragraphs within an answer are separated by a blank line.
        p = answer.split('\n\n')
        paras.append(len(p))
        sens.extend(len(sent_tokenize(para)) for para in p)

    if not corpus:
        # random.choice() raises IndexError on an empty sequence.
        return "", []

    random.shuffle(corpus)
    spread = [random.choice(sens) for _ in range(random.choice(paras))]
    return "\n".join(corpus), spread
if __name__ == '__main__':
    # Exactly one command-line argument is expected: the database file.
    if len(sys.argv) != 2:
        # Rebind stdout so the usage text below goes to stderr.
        sys.stdout = sys.stderr
        usage = (
            "Usage: %s __FILENAME__" % sys.argv[0],
            " where __FILENAME__ is a formspringscrape database",
            " http://code.google.com/p/formspringscrape/",
        )
        for line in usage:
            print(line)
        sys.exit(1)
    # Analyze the database, build the chain from its corpus and print
    # text generated with the sampled paragraph/sentence spread.
    corpus, spread = analyze(sys.argv[1])
    print(Markov(corpus)(spread))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment