Skip to content

Instantly share code, notes, and snippets.

@hadifar
Last active November 20, 2021 11:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save hadifar/061d0523b6a6d5e7b1b9cebdeb251f14 to your computer and use it in GitHub Desktop.
Save hadifar/061d0523b6a6d5e7b1b9cebdeb251f14 to your computer and use it in GitHub Desktop.
This code is copied from the following repo: https://github.com/iamrkg31/sentence-to-clauses
import re
import nltk
def get_verb_phrases(t):
    """Return the text of the verb phrases found under tree node *t*.

    The node is handled in one of three ways:

    * its label is not "VP": every child subtree taller than a single
      tagged word (``height() > 2``) is searched recursively;
    * it is a "VP" with more than one "VP" child: each of those "VP"
      children (again only if ``height() > 2``) is searched recursively,
      so each inner verb phrase is reported separately;
    * any other "VP": the node's leaves joined by single spaces are
      reported as one verb phrase.

    Args:
        t: a tree node exposing ``label()``, ``leaves()`` and iteration
            over its children; child subtrees must expose ``label()``
            and ``height()``.

    Returns:
        A list of strings, one per verb phrase, in traversal order.
    """
    # Leaves are plain strings with no label()/height(); only child
    # subtrees take part in the search.
    subtrees = [child for child in t if not isinstance(child, str)]

    if t.label() != "VP":
        verb_phrases = []
        for child in subtrees:
            if child.height() > 2:
                verb_phrases.extend(get_verb_phrases(child))
        return verb_phrases

    vp_children = [child for child in subtrees if child.label() == "VP"]
    if len(vp_children) > 1:
        # Several VP children: descend so each one yields its own phrase.
        verb_phrases = []
        for child in vp_children:
            if child.height() > 2:
                verb_phrases.extend(get_verb_phrases(child))
        return verb_phrases

    # A VP with at most one VP child is kept whole.
    return [' '.join(t.leaves())]
def get_pos(t):
    """Collect tree positions of verb phrases and of non-clause siblings.

    The children of *t* decide which of three cases applies:

    * at least one child is labelled exactly "S", "SBAR", "SBARQ", "SINV"
      or "SQ": those children are searched recursively, and the position
      of every other child subtree is recorded in the second list (the
      caller treats these as subordinating conjunctions to remove);
    * otherwise, if a "VP" child exists: the position of each "VP" child
      is recorded in the first list;
    * otherwise: every child subtree with ``height() > 2`` is searched
      recursively.

    Labels are compared for exact equality, so tags that merely contain
    the letter "S" (for example "NNS" or "POS") are not treated as clauses.

    Args:
        t: a tree node whose child subtrees expose ``label()``,
            ``height()`` and ``treeposition()``.

    Returns:
        A tuple ``(vp_pos, sub_conj_pos)`` of two lists of tree positions.
    """
    clause_labels = ("S", "SBAR", "SBARQ", "SINV", "SQ")

    # Leaves are plain strings and carry no label or tree position.
    subtrees = [child for child in t if not isinstance(child, str)]
    labels = [child.label() for child in subtrees]
    has_clause_child = any(label in clause_labels for label in labels)

    vp_pos = []
    sub_conj_pos = []

    if has_clause_child:
        # To keep subordinating conjunctions in the output, stop recording
        # the non-clause children in this branch.
        for child in subtrees:
            if child.label() in clause_labels:
                inner_vp, inner_conj = get_pos(child)
                vp_pos.extend(inner_vp)
                sub_conj_pos.extend(inner_conj)
            else:
                sub_conj_pos.append(child.treeposition())
    elif "VP" in labels:
        vp_pos.extend(
            child.treeposition() for child in subtrees if child.label() == "VP"
        )
    else:
        for child in subtrees:
            if child.height() > 2:
                inner_vp, inner_conj = get_pos(child)
                vp_pos.extend(inner_vp)
                sub_conj_pos.extend(inner_conj)

    return (vp_pos, sub_conj_pos)
def print_clauses(parse_str):
    """Split a bracketed parse into simple clauses, print and return them.

    The parse is loaded as an ``nltk.tree.ParentedTree``. Every subtree
    labelled "S", "SBAR", "SBARQ", "SINV" or "SQ" whose parent is not
    itself one of those labels is cut out of the sentence tree, except an
    "S" whose only child is a "VP", which is left in place. For each
    extracted subtree, the verb phrases are collected with
    ``get_verb_phrases``; the positions returned by ``get_pos`` are then
    deleted and the remaining leaves form the subject phrase. One clause
    string ``subject_phrase + " " + verb_phrase`` is produced per verb
    phrase.

    Args:
        parse_str: bracketed tree string accepted by
            ``nltk.tree.ParentedTree.fromstring``.

    Returns:
        The list of clause strings (also printed to stdout).
    """
    sent_tree = nltk.tree.ParentedTree.fromstring(parse_str)
    clause_level_list = ["S", "SBAR", "SBARQ", "SINV", "SQ"]
    clause_list = []
    sub_trees = []

    # Break the tree into clause subtrees. The subtree list is materialised
    # first because the tree is mutated while walking it; walking in
    # reverse handles later and deeper nodes before earlier ones.
    for sub_tree in reversed(list(sent_tree.subtrees())):
        if sub_tree.label() not in clause_level_list:
            continue

        # The root has no parent; treat it like a clause under a
        # non-clause node instead of failing on None.label().
        parent = sub_tree.parent()
        if parent is not None and parent.label() in clause_level_list:
            continue

        # An "S" consisting only of a "VP" stays inside its parent.
        if (len(sub_tree) == 1 and sub_tree.label() == "S"
                and not isinstance(sub_tree[0], str)
                and sub_tree[0].label() == "VP"):
            continue

        sub_trees.append(sub_tree)
        # The root position () cannot be deleted, and the root is already
        # a standalone tree, so only detach non-root clauses.
        if parent is not None:
            del sent_tree[sub_tree.treeposition()]

    # For each clause subtree, build the simple sentences.
    for clause_tree in sub_trees:
        # Verb phrases must be read before anything is removed below.
        verb_phrases = get_verb_phrases(clause_tree)

        # Strip verb phrases and subordinating conjunctions; what is left
        # is mainly the subject. Deleting in reverse keeps the earlier
        # positions valid.
        vp_pos, sub_conj_pos = get_pos(clause_tree)
        for position in reversed(vp_pos):
            del clause_tree[position]
        for position in reversed(sub_conj_pos):
            del clause_tree[position]

        subject_phrase = ' '.join(clause_tree.leaves())

        for verb_phrase in verb_phrases:
            clause_list.append(subject_phrase + " " + verb_phrase)

    print(clause_list)
    return clause_list
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment