Skip to content

Instantly share code, notes, and snippets.

@mohdsanadzakirizvi
Last active February 13, 2019 08:26
Show Gist options
  • Save mohdsanadzakirizvi/9f967917079b53e2db997519e1d793bf to your computer and use it in GitHub Desktop.
Save mohdsanadzakirizvi/9f967917079b53e2db997519e1d793bf to your computer and use it in GitHub Desktop.
Intro to Stanford NLP
#get the dependency parse of the first sentence
print('---')
print('dependency parse of first sentence')
dependency_parse = sentence.basicDependencies
print(dependency_parse)
# get the first token of the first sentence
print('---')
print('first token of first sentence')
token = sentence.token[0]
print(token)
# get the part-of-speech tag
print('---')
print('part of speech tag of token')
token.pos
print(token.pos)
# get the named entity tag
print('---')
print('named entity tag of token')
print(token.ner)
# get an entity mention from the first sentence
print('---')
print('first entity mention in sentence')
print(sentence.mentions[0])
# access the coref chain
print('---')
print('coref chains for the example')
print(ann.corefChain)
# Use tokensregex patterns to find who wrote a sentence.
pattern = '([ner: PERSON]+) /wrote/ /an?/ []{0,3} /sentence|article/'
matches = client.tokensregex(text, pattern)
# sentences contains a list with matches for each sentence.
assert len(matches["sentences"]) == 3
# length tells you whether or not there are any matches in this
assert matches["sentences"][1]["length"] == 1
# You can access matches like most regex groups.
matches["sentences"][1]["0"]["text"] == "Chris wrote a simple sentence"
matches["sentences"][1]["0"]["1"]["text"] == "Chris"
# Use semgrex patterns to directly find who wrote what.
pattern = '{word:wrote} >nsubj {}=subject >dobj {}=object'
matches = client.semgrex(text, pattern)
# sentences contains a list with matches for each sentence.
assert len(matches["sentences"]) == 3
# length tells you whether or not there are any matches in this
assert matches["sentences"][1]["length"] == 1
# You can access matches like most regex groups.
matches["sentences"][1]["0"]["text"] == "wrote"
matches["sentences"][1]["0"]["$subject"]["text"] == "Chris"
matches["sentences"][1]["0"]["$object"]["text"] == "sentence"
from stanfordnlp.server import CoreNLPClient
# example text
print('---')
print('input text')
print('')
text = "Chris Manning is a nice person. Chris wrote a simple sentence. He also gives oranges to people."
print(text)
# set up the client
print('---')
print('starting up Java Stanford CoreNLP Server...')
# set up the client
with CoreNLPClient(annotators=['tokenize','ssplit','pos','lemma','ner','depparse','coref'], timeout=30000, memory='16G') as client:
# submit the request to the server
ann = client.annotate(text)
# get the first sentence
sentence = ann.sentence[0]
import pandas as pd
#extract lemma
def extract_lemma(doc):
parsed_text = {'word':[], 'lemma':[]}
for sent in doc.sentences:
for wrd in sent.words:
#extract text and lemma
parsed_text['word'].append(wrd.text)
parsed_text['lemma'].append(wrd.lemma)
#return a dataframe
return pd.DataFrame(parsed_text)
#call the function on doc
extract_lemma(doc)
#dictionary that contains pos tags and their explanations
pos_dict = {
'CC': 'coordinating conjunction','CD': 'cardinal digit','DT': 'determiner',
'EX': 'existential there (like: \"there is\" ... think of it like \"there exists\")',
'FW': 'foreign word','IN': 'preposition/subordinating conjunction','JJ': 'adjective \'big\'',
'JJR': 'adjective, comparative \'bigger\'','JJS': 'adjective, superlative \'biggest\'',
'LS': 'list marker 1)','MD': 'modal could, will','NN': 'noun, singular \'desk\'',
'NNS': 'noun plural \'desks\'','NNP': 'proper noun, singular \'Harrison\'',
'NNPS': 'proper noun, plural \'Americans\'','PDT': 'predeterminer \'all the kids\'',
'POS': 'possessive ending parent\'s','PRP': 'personal pronoun I, he, she',
'PRP$': 'possessive pronoun my, his, hers','RB': 'adverb very, silently,',
'RBR': 'adverb, comparative better','RBS': 'adverb, superlative best',
'RP': 'particle give up','TO': 'to go \'to\' the store.','UH': 'interjection errrrrrrrm',
'VB': 'verb, base form take','VBD': 'verb, past tense took',
'VBG': 'verb, gerund/present participle taking','VBN': 'verb, past participle taken',
'VBP': 'verb, sing. present, non-3d take','VBZ': 'verb, 3rd person sing. present takes',
'WDT': 'wh-determiner which','WP': 'wh-pronoun who, what','WP$': 'possessive wh-pronoun whose',
'WRB': 'wh-abverb where, when','QF' : 'quantifier, bahut, thoda, kam (Hindi)','VM' : 'main verb',
'PSP' : 'postposition, common in indian langs','DEM' : 'demonstrative, common in indian langs'
}
#extract parts of speech
def extract_pos(doc):
parsed_text = {'word':[], 'pos':[], 'exp':[]}
for sent in doc.sentences:
for wrd in sent.words:
if wrd.pos in pos_dict.keys():
pos_exp = pos_dict[wrd.pos]
else:
pos_exp = 'NA'
parsed_text['word'].append(wrd.text)
parsed_text['pos'].append(wrd.pos)
parsed_text['exp'].append(pos_exp)
#return a dataframe of pos and text
return pd.DataFrame(parsed_text)
#extract pos
extract_pos(doc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment