Skip to content

Instantly share code, notes, and snippets.

# jorendorff/censored.py Last active Dec 13, 2015

Full script, minus comments, of the mystery program in this blog post: http://pynash.org/2013/02/19/English-playground.html
 """ censored.py - Randomly ######## ############## ################## ##### #### ########## ## ### Brown Corpus. To run this program without affecting your Python install: mkdir mystery-program cd mystery-program wget https://gist.github.com/jorendorff/4987554/raw//censored.py virtualenv venv . venv/bin/activate pip install nltk python -c 'import nltk; nltk.download("brown")' python censored.py """ from nltk.corpus import brown from nltk.probability import FreqDist import random # ---------------------------------------------------------------------------- # Algorithm for selecting k random elements from a weighted list. # Cribbed from: http://stackoverflow.com/a/2149533/94977 class Node: # Each node in the heap has a weight, value, and total weight. # The total weight, self.tw, is self.w plus the weight of any children. __slots__ = ['w', 'v', 'tw'] def __init__(self, w, v, tw): self.w, self.v, self.tw = w, v, tw def rws_heap(items): # h is the heap. It's like a binary tree that lives in an array. # It has a Node for each pair in `items`. h is the root. Each # other Node h[i] has a parent at h[i>>1]. Each node has up to 2 # children, h[i<<1] and h[(i<<1)+1]. To get this nice simple # arithmetic, we have to leave h vacant. h = [None] # leave h vacant for w, v in items: h.append(Node(w, v, w)) for i in range(len(h) - 1, 1, -1): # total up the tws h[i>>1].tw += h[i].tw # add h[i]'s total to its parent return h def rws_heap_pop(h): gas = h.tw * random.random() # start with a random amount of gas i = 1 # start driving at the root while gas > h[i].w: # while we have enough gas to get past node i: gas -= h[i].w # drive past node i i <<= 1 # move to first child if gas > h[i].tw: # if we have enough gas: gas -= h[i].tw # drive past first child and descendants i += 1 # move to second child w = h[i].w # out of gas! h[i] is the selected node. v = h[i].v h[i].w = 0 # make sure this node isn't chosen again while i: # fix up total weights h[i].tw -= w i >>= 1 return v def random_weighted_sample_no_replacement(items, n): heap = rws_heap(items) # just make a heap... for i in range(n): yield rws_heap_pop(heap) # and pop n items off it. # ---------------------------------------------------------------------------- # our code def main(): freq = FreqDist() for word in brown.words(): freq.inc(word.lower()) paras = [] for para in brown.tagged_paras(categories='science_fiction', simplify_tags=True): if len(para) > 4 and sum(len(sentence) for sentence in para) > 40: paras.append(para) para = random.choice(paras) tag_names = dict(ADJ='adjective', ADV='adverb', N='noun', NUM='number', V='verb', VD='past tense verb', VG='-ing verb') items = [] total_words = 0 for i, sentence in enumerate(para): for j, (word, tag) in enumerate(sentence): total_words += 1 if word.isalpha() and tag in tag_names: # We weight the word based on its rarity; we want to replace rare words. rarity = 1 / (freq.freq(word.lower()) + 0.0001)**2 items.append((rarity, (i, j))) blanks = min(total_words // 7, len(items)) chosen = set(random_weighted_sample_no_replacement(items, blanks)) results = [] blank_labels = [] for i, sentence in enumerate(para): for j, (word, tag) in enumerate(sentence): if (i, j) in chosen: results.append(None) blank_labels.append(tag_names[tag]) else: results.append(word) width = 72 blank = '_' * 15 line1 = '' line2 = '' seen_blanks = 0 space_before = '' for word in results: label = '' if word is None: word = blank label = blank_labels[seen_blanks] seen_blanks += 1 else: if word in ('.', ',', '?', '!', ';', '--', "'", "''"): space_before = '' if len(line1) + len(space_before) + len(word) > width: print line1 print line2.rstrip() print line1 = '' line2 = '' space_before = '' line1 += space_before + word line2 += space_before + label.center(len(word)) if word in ('``', '`', '('): space_before = '' else: space_before = ' ' if line1: print line1 print line2.rstrip() print if __name__ == '__main__': main()
to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.