Last active
October 12, 2015 04:48
-
-
Save language-engineering/3973277 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sussex_nltk.parse import dep_parse_sentences_arceager # Import the function which parses an iterable of sentences | |
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader # Import the corpus reader | |
from nltk import pos_tag # Import the PoS tagging function | |
# Build a list of sentences that contain the verb "to buy" by filtering
# for several of its conjugations. Matching is exact and case-sensitive,
# so only the tokens listed here are recognised.
verb_variants = {"buy", "buys", "bought"}

# You can use any product category (or even all product categories),
# but DVD is small and takes less time to process.
#
# "&" finds the intersection of 2 sets (what they have in common), so the
# condition below reads: "if the set of sentence tokens has any token in
# common with our verbs, then keep the sentence". An empty intersection
# is falsy, which drops the sentence.
sentences = [
    sentence
    for sentence in AmazonReviewCorpusReader().category("dvd").sents()
    if verb_variants & set(sentence)
]

# Optionally limit the number of sentences to *n* for ease of processing
# and observation. Or perform some kind of sampling (not shown).
n = 10
sentences = sentences[:n]  # Slice notation: keep at most the first n

# Create an iterable over dependency parsed sentences.
# Round brackets create a generator expression instead of a list, so each
# sentence is only PoS-tagged when the consumer of *tagged_sents* asks for
# the next item, one at a time, rather than all up front.
tagged_sents = (pos_tag(sentence) for sentence in sentences)
parsed_sents = dep_parse_sentences_arceager(tagged_sents)

# Now you can iterate over *parsed_sents* performing computations or printing.
# print(...) with a single argument behaves the same on Python 2 and
# Python 3, whereas the bare print statement is a syntax error on Python 3.
for sentence in parsed_sents:
    print("------ Sentence ------")
    print(sentence)
    # Per the original notes, this sentence is a ParsedSentence object
    # (its API is described in the accompanying course material).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment