Skip to content

Instantly share code, notes, and snippets.

@nachowski
Forked from grantslatton/hngen.py
Last active December 30, 2015 01:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nachowski/7754528 to your computer and use it in GitHub Desktop.
import urllib2
import re
import sys
from collections import defaultdict
from random import random
import json
from collections import namedtuple
"""
Introducing: Face-Smash!
Create a file called statuses.json with the following data:
https://developers.facebook.com/tools/explorer/145634995501895/?method=GET&path=me%2Fstatuses%3Ffields%3Dmessage%26limit%3D400
You can alternatively call the Graph API directly using
https://graph.facebook.com/me/statuses?fields=message&limit=400
Based on this wonderful piece of code:
https://gist.github.com/grantslatton/7694811
"""
# Load the exported Facebook statuses (see the module docstring for how to
# fetch statuses.json via the Graph API) and collect each status message text.
# Using a with-statement guarantees the file is closed even if json parsing
# raises, instead of relying on a trailing close() call.
with open("statuses.json") as archive:
    data = json.loads(archive.read())[u'data']
# One entry per status update; each status dict carries its text under 'message'.
titles = [e[u'message'] for e in data]
# markov_map[prefix][word] counts how often `word` follows the (up to
# `lookback`-word) `prefix` across all statuses.
markov_map = defaultdict(lambda:defaultdict(int))
lookback = 2
#Generate map in the form word1 -> word2 -> occurrences of word2 after word1
# NOTE(review): titles[:-1] skips the last status — likely carried over from
# the forked HN script where the final entry was a nav link; confirm intended.
for title in titles[:-1]:
    title = title.split()
    if len(title) > lookback:
        # i runs to len(title) inclusive: at i == len(title) the successor
        # slice title[i:i+1] is empty, so ''.join gives '' — the end-of-sentence
        # sentinel consumed by the generation loop below.
        for i in xrange(len(title)+1):
            markov_map[' '.join(title[max(0,i-lookback):i])][' '.join(title[i:i+1])] += 1
# Turn the raw successor counts into conditional probabilities, in place:
# markov_map[prefix][word] becomes P(word | prefix).
for prefix, successors in markov_map.items():
    denom = float(sum(successors.values()))
    for nxt in successors:
        successors[nxt] /= denom
#Typical sampling from a categorical distribution
def sample(items):
    """Draw one key from (key, weight) pairs by weighted reservoir sampling.

    Walks the pairs once, keeping a running weight total; each item replaces
    the current choice with probability weight/total-so-far, which yields a
    draw proportional to the weights. Returns None if the total never becomes
    truthy (e.g. empty input or all-zero weights).
    """
    chosen = None
    cumulative = 0.0
    for key, weight in items:
        cumulative += weight
        if not cumulative:
            continue
        if random() < weight / cumulative:
            chosen = key
    return chosen
# Generate 100 novel sentences by walking the Markov chain.
sentences = []
while len(sentences) < 100:
    sentence = []
    # '' is the empty-prefix key, i.e. the distribution over opening words.
    next_word = sample(markov_map[''].items())
    # '' is also the end-of-sentence sentinel inserted during map building.
    # NOTE(review): if a prefix has no successors, sample() returns None and
    # the join on the next iteration would raise; also, if every generated
    # sentence is a substring of a real status this loop never terminates.
    while next_word != '':
        sentence.append(next_word)
        next_word = sample(markov_map[' '.join(sentence[-lookback:])].items())
    sentence = ' '.join(sentence)
    flag = True
    for title in titles: #Prune titles that are substrings of actual titles
        if sentence in title:
            flag = False
            break
    if flag:
        sentences.append(sentence)
# Emit each sentence followed by a blank line (Python 2 print statement).
# Encoding with errors='replace' keeps characters the console encoding cannot
# represent from raising UnicodeEncodeError.
for sentence in sentences:
    print (sentence + '\n').encode(sys.stdout.encoding, errors='replace')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment