Skip to content

Instantly share code, notes, and snippets.

Created December 1, 2013 10:12
Show Gist options
  • Save cheekybastard/7730902 to your computer and use it in GitHub Desktop.
Save cheekybastard/7730902 to your computer and use it in GitHub Desktop.
from collections import defaultdict
from random import randint
from random import random
import os
import itertools
import sys
# Settings
max_corpus_size = 20000  # upper bound on headlines kept across all dictionaries

# Accept args for different dictionaries.
# sys.argv[0] is the script path, not a dictionary name, so it is skipped.
dicts = sys.argv[1:]
if not dicts:
    sys.exit("usage: %s DICTIONARY [DICTIONARY ...]" % sys.argv[0])

# import multiple dictionaries
# NOTE: `dir` shadows the builtin; the name is kept so the module-level
# names of this script stay unchanged.
dir = os.path.dirname(__file__)
imported_titles = []
# Floor division keeps the limit an int, which randint() and slicing require.
per_dictionary_limit = max_corpus_size // len(dicts)
for dictionary_name in dicts:
    filename = os.path.join(dir, "dictionaries", dictionary_name + ".txt")
    # The context manager closes the file even if reading fails.
    with open(filename) as archive:
        dict_titles = archive.read().split("\n")
    if len(dict_titles) > per_dictionary_limit:
        # Too many headlines: keep one random contiguous window of the file
        # so every dictionary contributes at most its share of the corpus.
        window_start = randint(0, len(dict_titles) - per_dictionary_limit)
        dict_titles = dict_titles[window_start:window_start + per_dictionary_limit]
    imported_titles.extend(dict_titles)
titles = []
# lowercase everything, remove quotes (to prevent dangling quotes)
# we're also going to create a mapping back to the original case so that
# unusually cased words (like iPad) can be restored after generation
titlecase_mappings = {}
for title in imported_titles:
    unquoted = title.replace("\"", "")
    titles.append(unquoted.lower())
    # Split on any whitespace, exactly like the chain builder does, so every
    # token that can appear in a generated sentence has a mapping entry
    # (splitting on " " alone left tabs and trailing "\r" glued to words).
    for original_word in unquoted.split():
        # Later occurrences overwrite earlier ones: the last casing seen wins.
        titlecase_mappings[original_word.lower()] = original_word
# state (up to `lookback` previous words, space-joined) -> next word -> weight
markov_map = defaultdict(lambda: defaultdict(int))
lookback = 2

# Generate map in the form word1 -> word2 -> occurrences of word2 after word1.
# The empty string is used both as the start state (no previous words) and as
# the end-of-sentence marker (no following word).
# Every title is visited: empty entries left by trailing newlines are already
# rejected by the length check, so no blind slicing of the list is needed.
for title in titles:
    words = title.split()
    if len(words) <= lookback:
        continue  # too short to contribute a full-length state
    # range, not xrange: titles are short and this runs on Python 2 and 3.
    for i in range(len(words) + 1):
        state = ' '.join(words[max(0, i - lookback):i])
        following_word = ' '.join(words[i:i + 1])  # '' when i is past the end
        markov_map[state][following_word] += 1

# Convert map to the word1 -> word2 -> probability of word2 after word1
for following in markov_map.values():
    total = float(sum(following.values()))
    # Only values are reassigned, so iterating the dict here is safe.
    for key in following:
        following[key] /= total
# Typical sampling from a categorical distribution
def sample(items):
    """Draw one key from an iterable of (key, weight) pairs.

    The pairs are consumed in a single pass. Each pair replaces the current
    pick with probability ``weight / running_total``, which selects every key
    with probability proportional to its weight without needing the total up
    front.

    Args:
        items: iterable of ``(key, weight)`` pairs with non-negative weights.

    Returns:
        The chosen key, or ``None`` when ``items`` is empty or no weight seen
        so far has been positive.
    """
    chosen = None
    running_total = 0.0
    for key, weight in items:
        running_total += weight
        # The truthiness check avoids dividing by zero while all weights
        # seen so far are zero.
        if running_total and random() < weight / running_total:
            chosen = key
    return chosen
def get_sentence(length_max=140):
    """Generate one novel sentence by walking the Markov chain.

    Starting from the empty start state, words are sampled from
    ``markov_map`` until the empty-string end marker is drawn. Candidates are
    rejected and regenerated when they are a substring of a known title or
    longer than ``length_max`` characters.

    Args:
        length_max: maximum allowed length of the sentence, in characters.

    Returns:
        The generated (lowercase) sentence as a single space-joined string.

    Raises:
        ValueError: if the chain has no start state to sample from.

    Note:
        This is a rejection loop: it does not terminate if every sentence the
        chain can produce is rejected.
    """
    # .get() rather than indexing, so probing does not insert a '' key into
    # the defaultdict.
    if not markov_map.get(''):
        raise ValueError("markov_map has no start state; the corpus is empty "
                         "or every title is too short")
    while True:
        sentence = []
        next_word = sample(markov_map[''].items())
        while next_word != '':
            # Record the word before advancing; the next state is built from
            # the last `lookback` words of the sentence so far.
            sentence.append(next_word)
            next_word = sample(markov_map[' '.join(sentence[-lookback:])].items())
        sentence = ' '.join(sentence)
        if any(sentence in title for title in titles):
            continue  # Prune titles that are substrings of actual titles
        if len(sentence) > length_max:
            continue  # Too long; try again
        return sentence
def retitlize_sentence(sentence, mappings=None):
    """Restore the original casing of each word in a lowercase sentence.

    Args:
        sentence: words separated by single spaces.
        mappings: optional dict from lowercase word to its original casing.
            Defaults to the module-level ``titlecase_mappings``.

    Returns:
        The sentence with every word that has a mapping replaced by its
        mapped form; words without a mapping are left unchanged.
    """
    if mappings is None:
        mappings = titlecase_mappings
    return " ".join(mappings.get(word, word) for word in sentence.split(" "))
# Emit ten generated headlines, one per line, with original casing restored.
for _ in range(10):
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(retitlize_sentence(get_sentence()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment