joyrexus/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Generate a list of word pairs from Moby Dick.
Taken from Matthew Rocklin's post on text benchmarks.
Nice use of pipe and sliding_window here!
Install PyToolz before running: pip install toolz.

  
## make_pairs.py
'''
Generate a list of word pairs from Moby Dick.

Taken from Matthew Rocklin's post on text benchmarks.
http://matthewrocklin.com/blog/work/2014/01/13/Text-Benchmarks/

'''
from toolz.curried import *
import urllib
import os

def stem(word):
    '''
    Normalize a word: lower-case it, then trim trailing punctuation
    and leading quote characters.

    Only the characters listed below are removed, and only from the
    ends of the word; anything in the middle (e.g. the apostrophe in
    "ship's") is left alone.

    '''
    trailing_punctuation = ",.!:;'-\""
    leading_quotes = "'\""

    lowered = word.lower()
    trimmed_right = lowered.rstrip(trailing_punctuation)
    return trimmed_right.lstrip(leading_quotes)

filename = 'moby-dick.txt'

# Get it from the internet if you don't have it already
if not os.path.exists(filename):
    # `urllib.urlopen` only exists on Python 2; Python 3 moved it to
    # `urllib.request`.  Import locally so the script runs on both.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    # Fetch the whole body *before* creating the local file.  If the
    # request fails, no empty 'moby-dick.txt' is left behind to make the
    # next run skip the download.
    response = urlopen('http://www.gutenberg.org/ebooks/2701.txt.utf-8')
    try:
        text = response.read()
    finally:
        response.close()

    # Binary mode: on Python 3 `read()` returns bytes, which cannot be
    # written to a text-mode file.
    with open(filename, 'wb') as f:
        f.write(text)

# Write every consecutive pair of stemmed words to 'pairs.txt', one
# "first,second" pair per line.
#
# The input file is opened in the `with` statement as well, so it is closed
# when the pipeline finishes (handing bare `open` to pipe() never closed it)
# and 'pairs.txt' is not truncated if the input cannot be opened.
#
# `map` below is the curried version from toolz.curried: called with only a
# function, it returns a callable that pipe() applies to the running value.
with open(filename) as lines, open('pairs.txt', 'w') as f:
    pipe(lines,                # Iterate over the lines of the book
         drop(112),            # Drop 112 line header
         map(str.split),       # Split each line into words
         concat,               # Flatten the per-line lists into one stream
         map(stem),            # Stem each word
         sliding_window(2),    # Consider each consecutive pair
         map(','.join),        # Join each pair with a comma
         '\n'.join,            # Join all of the pairs with newlines
         f.write)              # Write to file
	'''
	Generate a list of word pairs from Moby Dick.

	Taken from Matthew Rocklin's post on text benchmarks.
	http://matthewrocklin.com/blog/work/2014/01/13/Text-Benchmarks/

	'''
	from toolz.curried import *
	import urllib
	import os

	def stem(word):
		'''
		Stem word to primitive form

		Lower-cases the word, then strips trailing punctuation and
		leading quote characters.

		'''
		return word.lower().rstrip(",.!:;'-\"").lstrip("'\"")

	filename = 'moby-dick.txt'

	# Get it from the internet if you don't have it already
	if not os.path.exists(filename):
		# `urllib.urlopen` only exists on Python 2; Python 3 moved it to
		# `urllib.request`.  Import locally so the script runs on both.
		try:
			from urllib.request import urlopen
		except ImportError:
			from urllib import urlopen

		# Fetch the whole body *before* creating the local file, so a
		# failed request does not leave an empty 'moby-dick.txt' that
		# makes the next run skip the download.
		response = urlopen('http://www.gutenberg.org/ebooks/2701.txt.utf-8')
		try:
			text = response.read()
		finally:
			response.close()

		# Binary mode: on Python 3 `read()` returns bytes.
		with open(filename, 'wb') as f:
			f.write(text)

	# Write every consecutive pair of stemmed words to 'pairs.txt', one
	# "first,second" pair per line.  The input file is opened in the `with`
	# statement too, so it is closed when the pipeline finishes.
	with open(filename) as lines, open('pairs.txt', 'w') as f:
		pipe(lines, # Iterate over the lines of the book
			drop(112), # Drop 112 line header
			map(str.split), # Split each line into words
			concat, # Flatten the per-line lists into one stream
			map(stem), # Stem each word
			sliding_window(2), # Consider each consecutive pair
			map(','.join), # Join each pair with a comma
			'\n'.join, # Join all of the pairs with newlines
			f.write) # Write to file