# jorendorff/censored.py

## censored.py
""" censored.py - Randomly ######## ############## ################## #####
#### ########## ## ### Brown Corpus.

To run this program without affecting your Python install:

    mkdir mystery-program
    cd mystery-program
    wget https://gist.github.com/jorendorff/4987554/raw//censored.py
    virtualenv venv
    . venv/bin/activate
    pip install nltk
    python -c 'import nltk; nltk.download("brown")'
    python censored.py

"""

from nltk.corpus import brown
from nltk.probability import FreqDist
import random

# ----------------------------------------------------------------------------
# Algorithm for selecting k random elements from a weighted list.
# Cribbed from: http://stackoverflow.com/a/2149533/94977

class Node(object):
    """One slot of the array-backed heap used for weighted sampling.

    Attributes:
      w  -- this node's own weight
      v  -- the value carried by this node
      tw -- total weight: `w` plus the total weight of any children
    """
    # Deriving from `object` makes this a new-style class on Python 2,
    # which is required there for __slots__ to take effect.
    __slots__ = ['w', 'v', 'tw']

    def __init__(self, w, v, tw):
        self.w, self.v, self.tw = w, v, tw

def rws_heap(items):
    """Build the array-backed heap used for weighted sampling.

    `items` is an iterable of (weight, value) pairs.  The result is a list
    laid out as an implicit binary tree: slot 0 is an unused placeholder,
    slot 1 is the root, and the node in slot k has its parent in slot
    k >> 1 and its children in slots k << 1 and (k << 1) + 1.  Every
    node's `tw` ends up as its own weight plus the `tw` of its children.
    """
    heap = [None]           # slot 0 stays empty so the index math works
    heap.extend(Node(weight, value, weight) for weight, value in items)
    # Visit nodes from the last slot back to slot 2.  By the time a node is
    # reached, all of its children have already been folded into it, so its
    # total is final when it is added to its parent.
    for child in reversed(range(2, len(heap))):
        heap[child >> 1].tw += heap[child].tw
    return heap

def rws_heap_pop(h):
    """Pick one node of heap `h` at random, weighted by `w`, and return its value.

    `h` is a heap as built by rws_heap (slot 0 unused, root in slot 1).
    The selected node's weight is set to 0 and that weight is subtracted
    from the `tw` of the node and of every ancestor up to the root, so
    later calls will not pick it again.
    """
    # Draw a random budget between 0 and the total weight of the whole tree.
    fuel = random.random() * h[1].tw

    idx = 1                         # begin the walk at the root
    while fuel > h[idx].w:          # budget exceeds this node's own weight
        fuel -= h[idx].w            #   spend it and go below this node
        idx *= 2                    #   step to the first child
        first_subtree = h[idx].tw
        if fuel > first_subtree:    #   budget also covers that whole subtree
            fuel -= first_subtree   #     skip the subtree entirely
            idx += 1                #     and continue at the second child

    # The budget ran out here, so this is the selected node.
    picked = h[idx]
    weight, value = picked.w, picked.v

    picked.w = 0                    # this node can no longer be selected
    while idx > 0:                  # walk back up, fixing the totals
        h[idx].tw -= weight
        idx //= 2
    return value

def random_weighted_sample_no_replacement(items, n):
    """Generate `n` values drawn from `items`, an iterable of (weight, value) pairs.

    A heap is built from `items` once, then one value is popped from it
    per iteration; each pop zeroes the weight of the node it selected.
    Being a generator, nothing runs until the result is iterated.
    """
    weighted_heap = rws_heap(items)
    for _ in range(n):
        yield rws_heap_pop(weighted_heap)

# ----------------------------------------------------------------------------
# our code

def main():
    """Print a random Brown Corpus paragraph with some words blanked out.

    Steps:
      1. Count lower-cased word frequencies over the whole corpus.
      2. Pick at random a tagged science-fiction paragraph that has more
         than 4 sentences and more than 40 tokens.
      3. Blank out about one token in seven, favouring rare words; only
         alphabetic tokens whose tag has an entry in `tag_names` qualify.
      4. Print the paragraph wrapped at 72 columns.  Under each line of
         text goes a second line carrying, centred beneath every blank,
         the label for the word that was removed.

    Takes no arguments and returns None; all output goes to stdout.
    """
    # Handing the samples to the constructor yields the same counts as
    # incrementing them one at a time, without relying on FreqDist.inc().
    freq = FreqDist(word.lower() for word in brown.words())

    # NOTE(review): `simplify_tags` and the short tag names in `tag_names`
    # below look like the NLTK 2 simplified tagset -- confirm that the
    # installed NLTK version still accepts them.
    candidates = [
        p for p in brown.tagged_paras(categories='science_fiction',
                                      simplify_tags=True)
        if len(p) > 4 and sum(len(sentence) for sentence in p) > 40
    ]
    para = random.choice(candidates)

    tag_names = dict(ADJ='adjective', ADV='adverb', N='noun', NUM='number',
                     V='verb', VD='past tense verb', VG='-ing verb')

    # Collect (weight, position) pairs for every token that may be blanked.
    items = []
    total_words = 0
    for i, sentence in enumerate(para):
        for j, (word, tag) in enumerate(sentence):
            total_words += 1
            if word.isalpha() and tag in tag_names:
                # We weight the word based on its rarity; we want to
                # replace rare words.
                rarity = 1 / (freq.freq(word.lower()) + 0.0001)**2
                items.append((rarity, (i, j)))
    blanks = min(total_words // 7, len(items))
    chosen = set(random_weighted_sample_no_replacement(items, blanks))

    # Flatten the paragraph; None marks a blank, and blank_labels holds
    # the labels of the blanks in reading order.
    results = []
    blank_labels = []
    for i, sentence in enumerate(para):
        for j, (word, tag) in enumerate(sentence):
            if (i, j) in chosen:
                results.append(None)
                blank_labels.append(tag_names[tag])
            else:
                results.append(word)

    def emit(text_line, label_line):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(text_line)
        print(label_line.rstrip())
        print('')

    width = 72
    blank = '_' * 15
    no_space_before = ('.', ',', '?', '!', ';', '--', "'", "''")
    no_space_after = ('``', '`', '(')

    line1 = ''          # the text line being assembled
    line2 = ''          # the label line that goes underneath it
    seen_blanks = 0
    space_before = ''
    for word in results:
        label = ''
        if word is None:
            word = blank
            label = blank_labels[seen_blanks]
            seen_blanks += 1
        elif word in no_space_before:
            space_before = ''

        # Start a new output line when this token would not fit.
        if len(line1) + len(space_before) + len(word) > width:
            emit(line1, line2)
            line1 = ''
            line2 = ''
            space_before = ''
        line1 += space_before + word
        # Pad the label to the token's width so both lines stay aligned.
        line2 += space_before + label.center(len(word))

        space_before = '' if word in no_space_after else ' '

    if line1:
        emit(line1, line2)

# Run the program only when this file is executed as a script.
if __name__ == "__main__":
    main()
	""" censored.py - Randomly ######## ############## ################## #####
	#### ########## ## ### Brown Corpus.

	To run this program without affecting your Python install:

	mkdir mystery-program
	cd mystery-program
	wget https://gist.github.com/jorendorff/4987554/raw//censored.py
	virtualenv venv
	. venv/bin/activate
	pip install nltk
	python -c 'import nltk; nltk.download("brown")'
	python censored.py

	"""

	from nltk.corpus import brown
	from nltk.probability import FreqDist
	import random

	# ----------------------------------------------------------------------------
	# Algorithm for selecting k random elements from a weighted list.
	# Cribbed from: http://stackoverflow.com/a/2149533/94977

	class Node(object):
	    """One slot of the array-backed heap used for weighted sampling.

	    Attributes:
	      w  -- this node's own weight
	      v  -- the value carried by this node
	      tw -- total weight: `w` plus the total weight of any children
	    """
	    # Deriving from `object` makes this a new-style class on Python 2,
	    # which is required there for __slots__ to take effect.
	    __slots__ = ['w', 'v', 'tw']

	    def __init__(self, w, v, tw):
	        self.w, self.v, self.tw = w, v, tw

	def rws_heap(items):
	    """Build the array-backed heap used for weighted sampling.

	    `items` is an iterable of (weight, value) pairs.  The returned list
	    is an implicit binary tree: h[0] is left vacant, h[1] is the root,
	    every other node h[i] has its parent at h[i>>1] and up to two
	    children at h[i<<1] and h[(i<<1)+1].  Each node's `tw` ends up as
	    its own weight plus the `tw` of its children.
	    """
	    h = [None]                          # leave h[0] vacant
	    for w, v in items:
	        h.append(Node(w, v, w))
	    # Going from the last node back to h[2] means every node's total is
	    # final before it is added to its parent.
	    for i in range(len(h) - 1, 1, -1):  # total up the tws
	        h[i>>1].tw += h[i].tw           # add h[i]'s total to its parent
	    return h

	def rws_heap_pop(h):
	    """Pick one node of heap `h` at random, weighted by `w`, and return its value.

	    `h` is a heap as built by rws_heap (h[0] vacant, root at h[1]).  The
	    selected node's weight is set to 0 and that weight is subtracted
	    from the `tw` of the node and of every ancestor up to the root, so
	    later calls will not pick it again.
	    """
	    gas = h[1].tw * random.random()     # start with a random amount of gas

	    i = 1                     # start driving at the root
	    while gas > h[i].w:       # while we have enough gas to get past node i:
	        gas -= h[i].w         #   drive past node i
	        i <<= 1               #   move to first child
	        if gas > h[i].tw:     #   if we have enough gas:
	            gas -= h[i].tw    #     drive past first child and descendants
	            i += 1            #     move to second child
	    w = h[i].w                # out of gas! h[i] is the selected node.
	    v = h[i].v

	    h[i].w = 0                # make sure this node isn't chosen again
	    while i:                  # fix up total weights
	        h[i].tw -= w
	        i >>= 1
	    return v

	def random_weighted_sample_no_replacement(items, n):
	    """Generate `n` values drawn from `items`, an iterable of (weight, value) pairs.

	    A heap is built from `items` once, then one value is popped from it
	    per iteration; each pop zeroes the weight of the node it selected.
	    """
	    heap = rws_heap(items)              # just make a heap...
	    for _ in range(n):
	        yield rws_heap_pop(heap)        # and pop n items off it.

	# ----------------------------------------------------------------------------
	# our code

	def main():
	    """Print a random Brown Corpus paragraph with some words blanked out.

	    Steps:
	      1. Count lower-cased word frequencies over the whole corpus.
	      2. Pick at random a tagged science-fiction paragraph that has more
	         than 4 sentences and more than 40 tokens.
	      3. Blank out about one token in seven, favouring rare words; only
	         alphabetic tokens whose tag has an entry in `tag_names` qualify.
	      4. Print the paragraph wrapped at 72 columns.  Under each line of
	         text goes a second line carrying, centred beneath every blank,
	         the label for the word that was removed.

	    Takes no arguments and returns None; all output goes to stdout.
	    """
	    # Handing the samples to the constructor yields the same counts as
	    # incrementing them one at a time, without relying on FreqDist.inc().
	    freq = FreqDist(word.lower() for word in brown.words())

	    # NOTE(review): `simplify_tags` and the short tag names in `tag_names`
	    # below look like the NLTK 2 simplified tagset -- confirm that the
	    # installed NLTK version still accepts them.
	    candidates = [
	        p for p in brown.tagged_paras(categories='science_fiction',
	                                      simplify_tags=True)
	        if len(p) > 4 and sum(len(sentence) for sentence in p) > 40
	    ]
	    para = random.choice(candidates)

	    tag_names = dict(ADJ='adjective', ADV='adverb', N='noun', NUM='number',
	                     V='verb', VD='past tense verb', VG='-ing verb')

	    # Collect (weight, position) pairs for every token that may be blanked.
	    items = []
	    total_words = 0
	    for i, sentence in enumerate(para):
	        for j, (word, tag) in enumerate(sentence):
	            total_words += 1
	            if word.isalpha() and tag in tag_names:
	                # We weight the word based on its rarity; we want to
	                # replace rare words.
	                rarity = 1 / (freq.freq(word.lower()) + 0.0001)**2
	                items.append((rarity, (i, j)))
	    blanks = min(total_words // 7, len(items))
	    chosen = set(random_weighted_sample_no_replacement(items, blanks))

	    # Flatten the paragraph; None marks a blank, and blank_labels holds
	    # the labels of the blanks in reading order.
	    results = []
	    blank_labels = []
	    for i, sentence in enumerate(para):
	        for j, (word, tag) in enumerate(sentence):
	            if (i, j) in chosen:
	                results.append(None)
	                blank_labels.append(tag_names[tag])
	            else:
	                results.append(word)

	    def emit(text_line, label_line):
	        # Single-argument print(...) behaves the same on Python 2 and 3.
	        print(text_line)
	        print(label_line.rstrip())
	        print('')

	    width = 72
	    blank = '_' * 15
	    no_space_before = ('.', ',', '?', '!', ';', '--', "'", "''")
	    no_space_after = ('``', '`', '(')

	    line1 = ''          # the text line being assembled
	    line2 = ''          # the label line that goes underneath it
	    seen_blanks = 0
	    space_before = ''
	    for word in results:
	        label = ''
	        if word is None:
	            word = blank
	            label = blank_labels[seen_blanks]
	            seen_blanks += 1
	        elif word in no_space_before:
	            space_before = ''

	        # Start a new output line when this token would not fit.
	        if len(line1) + len(space_before) + len(word) > width:
	            emit(line1, line2)
	            line1 = ''
	            line2 = ''
	            space_before = ''
	        line1 += space_before + word
	        # Pad the label to the token's width so both lines stay aligned.
	        line2 += space_before + label.center(len(word))

	        space_before = '' if word in no_space_after else ' '

	    if line1:
	        emit(line1, line2)

	# Run the program only when this file is executed as a script.
	if __name__ == '__main__':
	    main()