language-engineering/gist:3973277

## gistfile1.py
from itertools import islice                                     # Import the lazy "take the first n items" helper

from nltk import pos_tag                                         # Import the PoS tagging function
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader  # Import the corpus reader
from sussex_nltk.parse import dep_parse_sentences_arceager       # Import the function which parses an iterable of sentences

# Create a list of sentences that contain the verb "to buy",
# by filtering for several of its conjugations

verb_variants = {"buy", "buys", "bought"}

# Keep at most *n* matching sentences, for ease of processing and
# observation. Use None for "no limit"; *n* must not be negative.
# If you would rather sample than take the first *n* matches, collect
# all the matches into a list first (not shown).
n = 10

# You can use any product category (or even all product categories),
# but DVD is small and takes less time to process.
# Check for several variations of the verb. "&" finds the intersection
# of 2 sets (what they have in common). So the filter below says
# "if the set of sentence tokens has any token in common with our verbs,
# then keep the sentence".
matching_sentences = (
    sentence
    for sentence in AmazonReviewCorpusReader().category("dvd").sents()
    if verb_variants & set(sentence)
)

# islice stops pulling sentences from the corpus as soon as *n* matches
# have been found, instead of filtering every sentence and slicing the
# full list afterwards. The resulting list is the same: the first *n*
# matching sentences, in corpus order.
sentences = list(islice(matching_sentences, n))

# Build an iterable over dependency parsed sentences.
# The round brackets make *tagged_sents* a generator expression rather
# than a list (see the link below the code for an explanation of generator
# expressions), so no sentence is PoS-tagged up front: each one is tagged
# only when the parser pulls it from the generator, one at a time.
tagged_sents = (pos_tag(tokens) for tokens in sentences)
parsed_sents = dep_parse_sentences_arceager(tagged_sents)

# Now you can iterate over *parsed_sents*, performing computations or
# printing. print is called with a single parenthesised argument so that
# these lines produce the same output under both Python 2 and Python 3.
for sentence in parsed_sents:
    print("------ Sentence ------")
    print(sentence)
# This sentence is a ParsedSentence object, which is explained further in
# the next section.
	from sussex_nltk.parse import dep_parse_sentences_arceager  # Import the function which parses an iterable of sentences
	from sussex_nltk.corpus_readers import AmazonReviewCorpusReader  # Import the corpus reader
	from nltk import pos_tag  # Import the PoS tagging function

	# Create a list of sentences that contain the verb "to buy",
	# by filtering for several of its conjugations

	sentences = []
	verb_variants = {"buy", "buys", "bought"}
	# You can use any product category (or even all product categories),
	# but DVD is small and takes less time to process
	for sentence in AmazonReviewCorpusReader().category("dvd").sents():
		# Check for several variations of the verb. "&" finds the intersection
		# of 2 sets (what they have in common). So the below statement says
		# "if the set of sentence tokens has any token in common with our verbs,
		# then keep the sentence"
		if verb_variants & set(sentence):
			# Populate our list of sentences with this sentence if it
			# contains what we're after.
			sentences.append(sentence)

	# Optionally limit the number of sentences to n for ease of processing
	# and observation. Or perform some kind of sampling (not shown)
	n = 10
	sentences = sentences[:n]  # Slice notation

	# Create an iterable over dependency parsed sentences
	# Round brackets create a generator instead of a list, see the link below
	# the code for an explanation of generator expressions. This means that the
	# sentences will only be PoS-tagged as and when the parser needs them, one
	# at a time.
	tagged_sents = (pos_tag(sentence) for sentence in sentences)
	parsed_sents = dep_parse_sentences_arceager(tagged_sents)

	# Now you can iterate over parsed_sents performing computations or printing.
	# print is called with a single parenthesised argument so that these lines
	# produce the same output under both Python 2 and Python 3.
	for sentence in parsed_sents:
		print("------ Sentence ------")
		print(sentence)
	# This sentence is a ParsedSentence object, which is explained further in
	# the next section.