Skip to content

Instantly share code, notes, and snippets.

@mbuff24
Created November 16, 2014 17:45
Show Gist options
  • Save mbuff24/8e32df0a66d0f09571da to your computer and use it in GitHub Desktop.
Save mbuff24/8e32df0a66d0f09571da to your computer and use it in GitHub Desktop.
# Treebank Parser tag definitions
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
#
# Triplet Extraction paper
# http://ailab.ijs.si/dunja/SiKDD2007/Papers/Rusu_Trippels.pdf
import json
# Penn Treebank part-of-speech tags, grouped by word class. The search helpers
# below test a leaf's tag for membership in one of these lists.
# Adjectives: base (JJ), comparative (JJR), superlative (JJS).
ADJECTIVE_TYPES = ["JJ", "JJR", "JJS"]
# Nouns: singular/mass (NN), proper singular (NNP), proper plural (NNPS),
# plural (NNS).
NOUN_TYPES = ["NN", "NNP", "NNPS", "NNS"]
# Verbs: base form (VB), past tense (VBD), gerund/present participle (VBG),
# past participle (VBN), non-3rd-person present (VBP), 3rd-person present (VBZ).
VERB_TYPES = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
# Adverbs: base (RB), comparative (RBR), superlative (RBS).
ADVERB_TYPES = ["RB", "RBR", "RBS"]
def findFirstType(tag_type, tree):
    """Return the first word whose tag is in *tag_type*, searching breadth-first.

    *tree* is a list of nodes. A leaf is a ``[word, tag]`` list; an inner node
    is a dict mapping a phrase label (e.g. ``"NP"``) to a list of child nodes.
    All nodes on one level are examined before any node on the next level, so
    a shallower match always wins over a deeper one.

    Returns the matching word, or ``None`` when no leaf in the tree carries
    one of the requested tags (the previous implementation recursed forever
    on an empty level in that case).
    """
    level = tree
    # An empty (or missing) level means the whole tree has been searched.
    while level:
        next_level = []
        for node in level:
            if isinstance(node, list):
                if node[1] in tag_type:
                    return node[0]
            elif isinstance(node, dict):
                # Queue the children of every phrase for the next pass.
                for children in node.values():
                    next_level.extend(children)
            else:
                print("t isn't list or dict, wtfman")
        level = next_level
    return None
def findDeepestType_r(tag_type, tree, candidates):
    """Collect every word whose tag is in *tag_type*, level by level.

    *tree* is a list of nodes: ``[word, tag]`` leaf lists and dicts mapping a
    phrase label to a list of child nodes. Matching words are appended to
    *candidates* in breadth-first order, so words found deeper in the tree
    come later in the list.

    *candidates* is mutated in place and the same list object is returned.
    """
    level = tree
    while True:
        next_level = []
        for node in level:
            if isinstance(node, list):
                # TODO: add some pruning here
                # ie. Verbs are only in VPs
                if node[1] in tag_type:
                    candidates.append(node[0])
            elif isinstance(node, dict):
                for children in node.values():
                    next_level.extend(children)
            else:
                print("t isn't list or dict, wtfman")
        # No dict nodes on this level means there is nothing deeper to visit.
        if not next_level:
            return candidates
        level = next_level
def findDeepestType(tag_type, tree):
    """Return the deepest word in *tree* whose tag is in *tag_type*.

    Relies on findDeepestType_r, which lists matches in breadth-first order;
    the last entry is therefore the match found on the deepest level.

    Returns ``None`` when the tree contains no matching word (previously this
    raised IndexError by indexing an empty list).
    """
    matches = findDeepestType_r(tag_type, tree, [])
    if not matches:
        return None
    return matches[-1]
def findAllTreesOfType(tag_type, tree, acc):
    """Collect the phrase nodes in *tree* whose label is in *tag_type*.

    *tree* is a list of nodes; only dict nodes (phrase label -> list of
    children) are considered, leaf lists are ignored. The search is
    breadth-first. A dict whose label matches is appended to *acc* as-is and
    its children are NOT searched further; the children of non-matching
    labels are queued for the next level.

    *acc* is mutated in place and the same list object is returned.
    """
    level = tree
    while True:
        next_level = []
        for node in level:
            if not isinstance(node, dict):
                continue
            for label, children in node.items():
                if label in tag_type:
                    acc.append(node)
                else:
                    next_level.extend(children)
        # Stop once a level yields no unexplored phrase children.
        if not next_level:
            return acc
        level = next_level
# def extract_attributes(word):
# if isAdjective(word)
# # all RB siblings
# #result =
# return word
def extract_subject(np):
    """Return the subject of a sentence: the first noun found in its NP subtree.

    *np* is the list of child nodes of the sentence's "NP" phrase, or ``None``
    when the sentence has no NP; in that case ``None`` is returned instead of
    failing while iterating over ``None``.
    """
    if np is None:
        return None
    # subject = first noun found in NP_subtree
    # TODO: extract attributes next..
    return findFirstType(NOUN_TYPES, np)
def extract_predicate(vp):
    """Return the predicate of a sentence: the deepest verb in its VP subtree.

    *vp* is the list of child nodes of the sentence's "VP" phrase, or ``None``
    when the sentence has no VP; in that case ``None`` is returned instead of
    failing while iterating over ``None``.
    """
    if vp is None:
        return None
    # verb = the deepest verb in VP_subtree
    # TODO: extract attributes next..
    return findDeepestType(VERB_TYPES, vp)
def extract_object(vp):
    """Return the object of a sentence, taken from phrases inside its VP subtree.

    Looks up every NP, PP and ADJP phrase reachable from *vp* (the list of
    child nodes of the "VP" phrase) and returns the first noun of the first
    NP/PP, or the first adjective of the first ADJP, that yields a word.
    A phrase that yields nothing no longer ends the search; the next
    candidate phrase is tried instead.

    Returns ``None`` when *vp* is ``None`` or no candidate phrase yields a word.
    """
    if vp is None:
        return None
    # siblings = all NP, PP, ADJP siblings of vp
    types = ["NP", "PP", "ADJP"]
    siblings = findAllTreesOfType(types, vp, [])
    for sibling in siblings:
        for key in sibling:
            if key == "NP" or key == "PP":
                # obj = first noun in sibling
                found = findFirstType(NOUN_TYPES, sibling[key])
            elif key == "ADJP":
                # obj = first adjective in sibling
                found = findFirstType(ADJECTIVE_TYPES, sibling[key])
            else:
                print("key isn't in [NP, PP, ADJP], wtfman")
                continue
            if found is not None:
                return found
    # TODO: extract attributes somewhere in here..
    return None
def extract_triplet(sent):
    """Print the subject, predicate and object of every parsed sentence.

    *sent* is the parsed document: a list of dicts, each with an "S" key whose
    value is the list of top-level phrases of one sentence. For each sentence
    the top-level "NP" and "VP" phrases are located and the subject,
    predicate and object extracted from them are printed, one per line.

    The function now iterates over its argument; it previously ignored the
    argument and read a module-level global instead.
    """
    for root in sent:
        phrases = root["S"]
        noun_phrase = None
        verb_phrase = None
        # S -> NP, VP
        for phrase in phrases:
            if "NP" in phrase:
                noun_phrase = phrase["NP"]
            elif "VP" in phrase:
                verb_phrase = phrase["VP"]
        # Function-call form of print works on both Python 2 and Python 3.
        print(extract_subject(noun_phrase))
        print(extract_predicate(verb_phrase))
        print(extract_object(verb_phrase))


# read json files!!
# The parsed data gets its own name so the `json` module is not shadowed, and
# the file is closed as soon as it has been read.
with open('./test.json') as handle:
    parsed_sentences = json.load(handle)
extract_triplet(parsed_sentences)
[
{
"S": [
{
"NP": [
[
"A",
"DT"
],
[
"rare",
"JJ"
],
[
"black",
"JJ"
],
[
"squirrel",
"NN"
]
]
},
{
"VP": [
[
"has",
"VBZ"
],
{
"VP": [
[
"become",
"VBN"
],
{
"NP": [
[
"a",
"DT"
],
[
"regular",
"JJ"
],
[
"visitor",
"NN"
]
]
},
{
"PP": [
[
"to",
"TO"
],
{
"NP": [
[
"a",
"DT"
],
[
"suburban",
"JJ"
],
[
"garden",
"NN"
]
]
}
]
}
]
}
]
}
]
}
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment