# lizadaly/example-bigrams.py

## example-bigrams.py
import nltk
from nltk.corpus import stopwords
from collections import Counter

# Accumulates every token from every text in the corpus, in corpus order.
word_list = []

# Build the set of English stop words (e.g. "the", "an") once, so each
# membership test in the filter below is a constant-time lookup.
stops = set(stopwords.words('english'))

# Gather the words of every file in NLTK's public-domain Gutenberg corpus.
# A plain loop is used because extend() is called only for its side effect;
# a list comprehension here would just build a throwaway list of None values.
for fileid in nltk.corpus.gutenberg.fileids():
    word_list.extend(nltk.corpus.gutenberg.words(fileid))

# Keep only purely alphanumeric tokens (this drops punctuation and any token
# containing it) and lower-case what remains.
cleaned_words = [w.lower() for w in word_list if w.isalnum()]

# Collect every pair of adjacent cleaned words that contains "sun", skipping
# pairs in which either word is a stop word and so too common to be
# interesting.
sun_bigrams = [
    b for b in nltk.bigrams(cleaned_words)
    if 'sun' in b and b[0] not in stops and b[1] not in stops
]
	import nltk
	from nltk.corpus import stopwords
	from collections import Counter

	# Accumulates every token from every text in the corpus, in corpus order.
	word_list = []

	# Build the set of English stop words (e.g. "the", "an") once, so each
	# membership test in the filter below is a constant-time lookup.
	stops = set(stopwords.words('english'))

	# Gather the words of every file in NLTK's public-domain Gutenberg corpus.
	# A plain loop is used because extend() is called only for its side effect;
	# a list comprehension here would just build a throwaway list of None values.
	for fileid in nltk.corpus.gutenberg.fileids():
		word_list.extend(nltk.corpus.gutenberg.words(fileid))

	# Keep only purely alphanumeric tokens (this drops punctuation and any token
	# containing it) and lower-case what remains.
	cleaned_words = [w.lower() for w in word_list if w.isalnum()]

	# Collect every pair of adjacent cleaned words that contains "sun", skipping
	# pairs in which either word is a stop word and so too common to be
	# interesting.
	sun_bigrams = [
		b for b in nltk.bigrams(cleaned_words)
		if 'sun' in b and b[0] not in stops and b[1] not in stops
	]