jorendorff/adlib.py

## adlib.py
""" adlib.py - Randomly generate Mad-Libs-style fill-in-the-blanks games
from paragraphs in the Brown Corpus.

To use this without affecting your Python install:

    mkdir adlib
    cd adlib
    wget https://gist.github.com/jorendorff/4987754/raw//adlib.py
    virtualenv venv
    . venv/bin/activate
    pip install nltk
    python -c 'import nltk; nltk.download("brown")'
    python adlib.py

"""

import nltk

from nltk.corpus import brown
from nltk.probability import FreqDist
import random

# ----------------------------------------------------------------------------
# Algorithm for selecting k random elements from a weighted list.
# Cribbed from: http://stackoverflow.com/a/2149533/94977

class Node:
    # Each node in the heap has a weight, value, and total weight.
    # The total weight, self.tw, is self.w plus the weight of any children.
    __slots__ = ['w', 'v', 'tw']
    def __init__(self, w, v, tw):
        self.w, self.v, self.tw = w, v, tw

def rws_heap(items):
    # h is the heap. It's like a binary tree that lives in an array.
    # It has a Node for each pair in `items`. h[1] is the root. Each
    # other Node h[i] has a parent at h[i>>1]. Each node has up to 2
    # children, h[i<<1] and h[(i<<1)+1].  To get this nice simple
    # arithmetic, we have to leave h[0] vacant.
    h = [None]                          # leave h[0] vacant
    for w, v in items:
        h.append(Node(w, v, w))
    for i in range(len(h) - 1, 1, -1):  # total up the tws
        h[i>>1].tw += h[i].tw           # add h[i]'s total to its parent
    return h

def rws_heap_pop(h):
    gas = h[1].tw * random.random()     # start with a random amount of gas

    i = 1                     # start driving at the root
    while gas > h[i].w:       # while we have enough gas to get past node i:
        gas -= h[i].w         #   drive past node i
        i <<= 1               #   move to first child
        if gas > h[i].tw:     #   if we have enough gas:
            gas -= h[i].tw    #     drive past first child and descendants
            i += 1            #     move to second child
    w = h[i].w                # out of gas! h[i] is the selected node.
    v = h[i].v

    h[i].w = 0                # make sure this node isn't chosen again
    while i:                  # fix up total weights
        h[i].tw -= w
        i >>= 1
    return v

def random_weighted_sample_no_replacement(items, n):
    heap = rws_heap(items)              # just make a heap...
    for i in range(n):
        yield rws_heap_pop(heap)        # and pop n items off it.

# ----------------------------------------------------------------------------
# our code

def adlib():
    # Count how often each word appears in the corpus.
    freq = FreqDist()
    for word in brown.words():
        freq.inc(word.lower())

    # Choose a paragraph randomly from the corpus,
    # making sure to eliminate short paragraphs.
    # For fun, we'll focus on science fiction.
    paras = []
    for para in brown.tagged_paras(categories='science_fiction', simplify_tags=True):
        if len(para) > 4 and sum(len(sentence) for sentence in para) > 40:
            paras.append(para)
    para = random.choice(paras)

    # Explanations of the tags that we want to replace.
    # (Never replace a preposition or a conjunction, for example;
    # those are no fun because they are such small categories.)
    # Simplified tags are a little too simple for our purpose,
    # because singular and plural nouns get the same tag, N,
    # but whatever.
    tag_names = dict(ADJ='adjective', ADV='adverb', N='noun', NUM='number',
                     V='verb', VD='past tense verb', VG='-ing verb')

    # Now choose some random words to replace.
    # The output of this step is a set of pairs of ints:
    #   (sentence_index, word_index)
    # --the coordinates of the chosen words within para.
    #
    # It would be easy to choose the words completely at random,
    # but for our purposes, we wish to choose
    #
    # We do this using the "random weighted sample" code from
    #   http://stackoverflow.com/a/2149533/94977
    # which requires us to build a list of pairs (weight, data)
    # where `data` is our coordinate pair.
    #
    items = []
    total_words = 0
    for i, sentence in enumerate(para):
        for j, (word, tag) in enumerate(sentence):
            total_words += 1
            if word.isalpha() and tag in tag_names:
                # We weight the word based on its rarity; we want to replace rare words.
                rarity = 1 / (freq.freq(word.lower()) + 0.0001)**2
                items.append((rarity, (i, j)))
    blanks = min(total_words // 7, len(items))
    chosen = set(random_weighted_sample_no_replacement(items, blanks))

    # Make a list of output words, replacing the chosen words with None.
    results = []
    blank_labels = []
    for i, sentence in enumerate(para):
        for j, (word, tag) in enumerate(sentence):
            if (i, j) in chosen:
                results.append(None)
                blank_labels.append(tag_names[tag])
            else:
                results.append(word)

    # Print the results. Ordinarily you would use textwrap for this, but it
    # doesn't quite apply here due to the blanks; also we need some extra
    # effort to omit spaces appropriately around certain punctuation marks.
    width = 72
    blank = '_' * 15
    line1 = ''
    line2 = ''
    seen_blanks = 0
    space_before = ''
    for word in results:
        label = ''
        if word is None:
            word = blank
            label = blank_labels[seen_blanks]
            seen_blanks += 1
        else:
            if word in ('.', ',', '?', '!', ';', '--', "'", "''"):
                space_before = ''

        if len(line1) + len(space_before) + len(word) > width:
            print line1
            print line2.rstrip()
            print
            line1 = ''
            line2 = ''
            space_before = ''
        line1 += space_before + word
        line2 += space_before + label.center(len(word))

        if word in ('``', '`', '('):
            space_before = ''
        else:
            space_before = ' '

    if line1:
        print line1
        print line2.rstrip()
        print

if __name__ == '__main__':
    adlib()
	""" adlib.py - Randomly generate Mad-Libs-style fill-in-the-blanks games
	from paragraphs in the Brown Corpus.

	To use this without affecting your Python install:

	mkdir adlib
	cd adlib
	wget https://gist.github.com/jorendorff/4987754/raw//adlib.py
	virtualenv venv
	. venv/bin/activate
	pip install nltk
	python -c 'import nltk; nltk.download("brown")'
	python adlib.py

	"""

	import nltk

	from nltk.corpus import brown
	from nltk.probability import FreqDist
	import random

	# ----------------------------------------------------------------------------
	# Algorithm for selecting k random elements from a weighted list.
	# Cribbed from: http://stackoverflow.com/a/2149533/94977

	class Node:
	# Each node in the heap has a weight, value, and total weight.
	# The total weight, self.tw, is self.w plus the weight of any children.
	__slots__ = ['w', 'v', 'tw']
	def __init__(self, w, v, tw):
	self.w, self.v, self.tw = w, v, tw

	def rws_heap(items):
	# h is the heap. It's like a binary tree that lives in an array.
	# It has a Node for each pair in `items`. h[1] is the root. Each
	# other Node h[i] has a parent at h[i>>1]. Each node has up to 2
	# children, h[i<<1] and h[(i<<1)+1]. To get this nice simple
	# arithmetic, we have to leave h[0] vacant.
	h = [None] # leave h[0] vacant
	for w, v in items:
	h.append(Node(w, v, w))
	for i in range(len(h) - 1, 1, -1): # total up the tws
	h[i>>1].tw += h[i].tw # add h[i]'s total to its parent
	return h

	def rws_heap_pop(h):
	gas = h[1].tw * random.random() # start with a random amount of gas

	i = 1 # start driving at the root
	while gas > h[i].w: # while we have enough gas to get past node i:
	gas -= h[i].w # drive past node i
	i <<= 1 # move to first child
	if gas > h[i].tw: # if we have enough gas:
	gas -= h[i].tw # drive past first child and descendants
	i += 1 # move to second child
	w = h[i].w # out of gas! h[i] is the selected node.
	v = h[i].v

	h[i].w = 0 # make sure this node isn't chosen again
	while i: # fix up total weights
	h[i].tw -= w
	i >>= 1
	return v

	def random_weighted_sample_no_replacement(items, n):
	heap = rws_heap(items) # just make a heap...
	for i in range(n):
	yield rws_heap_pop(heap) # and pop n items off it.

	# ----------------------------------------------------------------------------
	# our code

	def adlib():
	# Count how often each word appears in the corpus.
	freq = FreqDist()
	for word in brown.words():
	freq.inc(word.lower())

	# Choose a paragraph randomly from the corpus,
	# making sure to eliminate short paragraphs.
	# For fun, we'll focus on science fiction.
	paras = []
	for para in brown.tagged_paras(categories='science_fiction', simplify_tags=True):
	if len(para) > 4 and sum(len(sentence) for sentence in para) > 40:
	paras.append(para)
	para = random.choice(paras)

	# Explanations of the tags that we want to replace.
	# (Never replace a preposition or a conjunction, for example;
	# those are no fun because they are such small categories.)
	# Simplified tags are a little too simple for our purpose,
	# because singular and plural nouns get the same tag, N,
	# but whatever.
	tag_names = dict(ADJ='adjective', ADV='adverb', N='noun', NUM='number',
	V='verb', VD='past tense verb', VG='-ing verb')

	# Now choose some random words to replace.
	# The output of this step is a set of pairs of ints:
	# (sentence_index, word_index)
	# --the coordinates of the chosen words within para.
	#
	# It would be easy to choose the words completely at random,
	# but for our purposes, we wish to choose
	#
	# We do this using the "random weighted sample" code from
	# http://stackoverflow.com/a/2149533/94977
	# which requires us to build a list of pairs (weight, data)
	# where `data` is our coordinate pair.
	#
	items = []
	total_words = 0
	for i, sentence in enumerate(para):
	for j, (word, tag) in enumerate(sentence):
	total_words += 1
	if word.isalpha() and tag in tag_names:
	# We weight the word based on its rarity; we want to replace rare words.
	rarity = 1 / (freq.freq(word.lower()) + 0.0001)**2
	items.append((rarity, (i, j)))
	blanks = min(total_words // 7, len(items))
	chosen = set(random_weighted_sample_no_replacement(items, blanks))

	# Make a list of output words, replacing the chosen words with None.
	results = []
	blank_labels = []
	for i, sentence in enumerate(para):
	for j, (word, tag) in enumerate(sentence):
	if (i, j) in chosen:
	results.append(None)
	blank_labels.append(tag_names[tag])
	else:
	results.append(word)

	# Print the results. Ordinarily you would use textwrap for this, but it
	# doesn't quite apply here due to the blanks; also we need some extra
	# effort to omit spaces appropriately around certain punctuation marks.
	width = 72
	blank = '_' * 15
	line1 = ''
	line2 = ''
	seen_blanks = 0
	space_before = ''
	for word in results:
	label = ''
	if word is None:
	word = blank
	label = blank_labels[seen_blanks]
	seen_blanks += 1
	else:
	if word in ('.', ',', '?', '!', ';', '--', "'", "''"):
	space_before = ''

	if len(line1) + len(space_before) + len(word) > width:
	print line1
	print line2.rstrip()
	print
	line1 = ''
	line2 = ''
	space_before = ''
	line1 += space_before + word
	line2 += space_before + label.center(len(word))

	if word in ('``', '`', '('):
	space_before = ''
	else:
	space_before = ' '

	if line1:
	print line1
	print line2.rstrip()
	print

	if __name__ == '__main__':
	adlib()