from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sussex_nltk.parse import dep_parse_sentences_arceager
sentences = ["This is the first example sentence",
"This is the second example sentence",
"This is the third example sentence"]
parsed_sents = dep_parse_sentences_arceager(pos_tag(word_tokenize(sentence)) for sentence in sentences)
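# To inspect the parser output, you could print each parsed sentence (a sketch,
# assuming dep_parse_sentences_arceager returns the same kind of ParsedSentence
# objects as the loaders used in the snippets below):
for parsed_sentence in parsed_sents:
    print "--- Sentence ---"
    print parsed_sentence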
from sussex_nltk.parse import load_parsed_example_sentences
parsed_example_sentences = load_parsed_example_sentences()
# To inspect the sentences, you could print them straight out
for parsed_sentence in parsed_example_sentences:
    print "--- Sentence ---"
    print parsed_sentence
Input:
    query_token = the token we're interested in; it is part of a dependency tree of tokens.
Output:
    opinions = a list of token forms (which constitute the extracted opinions related to the query token)

opinions = []
for each dependant token of query_token:
    append the dependant's form to opinions if the dependency relation of the dependant is "det"
def opinion_extractor(aspect_token, parsed_sentence):
    # Your function will have 3 steps:

    # i. Initialise a list of opinions
    opinions = []

    # ii. Find opinions (as an example we get all the dependants of the aspect token that have the relation "det")
    opinions += [dependant.form for dependant in parsed_sentence.get_dependants(aspect_token) if dependant.deprel == "det"]
    # You can continue to add to "opinions". Remember you can get the head of a token, and filter by PoS tag or Deprel too!

    # iii. Return the list of opinions
    return opinions
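# A quick way to try the function out (a sketch, not part of the exercise itself):
# load the pre-parsed DVD review sentences for an aspect, find the aspect tokens
# in each sentence, and print whatever the function extracts for them.
from sussex_nltk.parse import load_parsed_dvd_sentences

for parsed_sentence in load_parsed_dvd_sentences("plot"):
    for aspect_token in parsed_sentence.get_query_tokens("plot"):
        print opinion_extractor(aspect_token, parsed_sentence)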
from sussex_nltk.parse import load_parsed_dvd_sentences, load_parsed_example_sentences
aspect = "plot" # Set this to the aspect token you're interested in
save_file_path = r"/path/to/savefile.txt" # Set this to the location of the file you wish to create/overwrite with the saved output.
# Tracking these numbers will allow us to see what proportion of sentences we discovered features in
sentences_with_discovered_features = 0 # Number of sentences we discovered features in
total_sentences = 0 # Total number of sentences
# This is a "with statement", it invokes a context manager, which handles the opening and closing of resources (like files)
# Say for example we acquire a list of BasicToken objects by getting all the dependants of a token:
dependants = parsed_sentence.get_dependants(aspect_token)
# We could filter that list, keeping only those tokens whose dependency relations with the aspect token are "dobj", by doing the following:
dependants = [token for token in dependants if token.deprel == "dobj"]
# Or we could filter that list, keeping only those tokens whose PoS tags are "RB" (for adverb)
dependants = [token for token in dependants if token.pos == "RB"]
# Or we could filter that list, keeping only those tokens whose form is NOT "main" or "special"
dependants = [token for token in dependants if token.form not in ("main", "special")]
# Given a ParsedSentence object, and an aspect token acquired from it (as in the previous section)
# Get the head of the aspect token
head_token = parsed_sentence.get_head(aspect_token)
print head_token
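# The head is often useful when it is itself an opinion word. For instance (an
# illustrative heuristic, not part of the exercise), you might keep it only when
# its PoS tag marks it as an adjective:
if head_token.pos.startswith("JJ"):
    print "Possible opinion:", head_token.form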
# Given a ParsedSentence object, and an aspect token acquired from it (as in the previous section)
# Get all of the dependants of that aspect token.
dependants = parsed_sentence.get_dependants(aspect_token)
# You could print them out for inspection
for dependant in dependants:
    print dependant
aspect = "dialogue"
# If you have a ParsedSentence object, you can get all the tokens whose form matches the aspect as shown below.
# So instead of just printing the parsed_sentence as in the previous section, get its aspect tokens and print them.
aspect_tokens = parsed_sentence.get_query_tokens(aspect)
# You could iterate over them and print them for inspection
for aspect_token in aspect_tokens:
    print aspect_token
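# Putting the two ideas together (a sketch): for every occurrence of the aspect
# word, list its dependants along with their dependency relations to it.
for aspect_token in aspect_tokens:
    for dependant in parsed_sentence.get_dependants(aspect_token):
        print dependant.deprel, dependant.form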
This shows you how to search the Amazon DVD reviews for a particular query token.
from sussex_nltk.parse import load_parsed_dvd_sentences
aspect = "dialogue" # Our aspect word
parsed_sentences = load_parsed_dvd_sentences(aspect)
# To inspect the sentences, you could print them straight out
for parsed_sentence in parsed_sentences:
    print "--- Sentence ---"
    print parsed_sentence
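# To focus on the aspect tokens themselves rather than whole sentences (a sketch
# that reloads the sentences in case the loader returns a one-shot generator):
for parsed_sentence in load_parsed_dvd_sentences(aspect):
    for aspect_token in parsed_sentence.get_query_tokens(aspect):
        print aspect_token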