Last active
November 20, 2021 11:49
-
-
Save hadifar/061d0523b6a6d5e7b1b9cebdeb251f14 to your computer and use it in GitHub Desktop.
I copied this code from the repo: https://github.com/iamrkg31/sentence-to-clauses
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import nltk
def get_verb_phrases(t):
    """Collect the verb phrases found under tree node ``t`` as strings.

    The tree is walked top-down with three cases:

    * a node whose label is not ``"VP"`` contributes whatever its children
      with ``height() > 2`` contribute;
    * a ``"VP"`` node with two or more direct ``"VP"`` children is split:
      only those ``"VP"`` children (again with ``height() > 2``) are
      searched further, other children are ignored;
    * any other ``"VP"`` node is emitted whole, as its leaves joined by
      single spaces.

    Args:
        t: A tree node offering ``label()``, ``leaves()`` and iteration over
            its children. Children that are inspected must offer ``label()``
            and ``height()``.
            NOTE(review): assumes words only occur below POS-tag nodes, so
            nodes reached here never have bare-string children -- confirm
            against the parser output being used.

    Returns:
        list[str]: Verb-phrase strings in left-to-right order; empty when
        nothing qualifies.
    """
    if t.label() != "VP":
        # Not a verb phrase itself: search every child that is tall enough.
        # Presumably ``height() > 2`` skips nodes that hold a single word.
        found = []
        for child in t:
            if child.height() > 2:
                found.extend(get_verb_phrases(child))
        return found

    nested_vps = [child for child in t if child.label() == "VP"]
    if len(nested_vps) > 1:
        # Several VP children under one VP: treat each as its own verb
        # phrase instead of emitting the combined span.
        found = []
        for child in nested_vps:
            if child.height() > 2:
                found.extend(get_verb_phrases(child))
        return found

    # A VP with at most one nested VP is kept intact.
    return [" ".join(t.leaves())]
def get_pos(t):
    """Find the tree positions to strip from ``t`` to leave its subject.

    Positions are whatever each node's ``treeposition()`` reports. Three
    cases are distinguished by the labels of ``t``'s direct children:

    * no clause-level child but at least one ``"VP"`` child: the positions
      of those ``"VP"`` children are recorded as verb-phrase positions;
    * neither a clause-level child nor a ``"VP"`` child: every child with
      ``height() > 2`` is searched recursively;
    * at least one clause-level child: clause-level children are searched
      recursively, and every other child is recorded in the second list
      (the original author uses this to drop subordinating conjunctions).

    Clause-level labels are matched exactly. A substring test would mistake
    tags such as ``"NNS"`` or ``"POS"`` for a clause because they contain
    the letter ``S``.

    Args:
        t: A tree node whose children offer ``label()``, ``height()`` and
            ``treeposition()`` (e.g. an ``nltk`` ``ParentedTree``).

    Returns:
        tuple[list, list]: ``(vp_pos, sub_conj_pos)`` -- positions of verb
        phrases, and positions of the non-clause siblings of clause-level
        children.
    """
    clause_labels = ("S", "SBAR", "SBARQ", "SINV", "SQ")
    child_labels = [child.label() for child in t]
    has_clause_child = any(label in clause_labels for label in child_labels)

    vp_pos = []
    sub_conj_pos = []

    if not has_clause_child and "VP" in child_labels:
        for child in t:
            if child.label() == "VP":
                vp_pos.append(child.treeposition())
    elif not has_clause_child:
        for child in t:
            if child.height() > 2:
                inner_vp, inner_other = get_pos(child)
                vp_pos.extend(inner_vp)
                sub_conj_pos.extend(inner_other)
    # comment this "else" part, if want to include subordinating conjunctions
    else:
        for child in t:
            if child.label() in clause_labels:
                inner_vp, inner_other = get_pos(child)
                vp_pos.extend(inner_vp)
                sub_conj_pos.extend(inner_other)
            else:
                sub_conj_pos.append(child.treeposition())

    return (vp_pos, sub_conj_pos)
def print_clauses(parse_str):
    """Split a bracketed parse string into simple clauses; print and return them.

    The string is parsed with ``nltk.tree.ParentedTree.fromstring``. Clause
    subtrees (labels ``S``, ``SBAR``, ``SBARQ``, ``SINV``, ``SQ``) are then
    collected, and for each one every verb phrase returned by
    ``get_verb_phrases`` is paired with the words left in the clause after
    the positions returned by ``get_pos`` have been deleted.

    Args:
        parse_str: Bracketed tree string, e.g.
            ``"(ROOT (S (NP (PRP He)) (VP (VBZ eats))))"``. A clause-labelled
            root without a wrapper node is accepted as well.

    Returns:
        list[str]: One ``"<remaining words> <verb phrase>"`` string per verb
        phrase. The same list is also printed to standard output.
    """
    clause_labels = ("S", "SBAR", "SBARQ", "SINV", "SQ")
    sent_tree = nltk.tree.ParentedTree.fromstring(parse_str)

    # Break the tree into clause subtrees. The subtree list is materialised
    # and walked in reverse so that detaching a subtree does not disturb
    # the ones still to be visited.
    clause_trees = []
    for sub_tree in reversed(list(sent_tree.subtrees())):
        if sub_tree.label() not in clause_labels:
            continue

        # The root has no parent; treat that the same as a non-clause parent
        # instead of calling ``.label()`` on ``None``.
        parent = sub_tree.parent()
        if parent is not None and parent.label() in clause_labels:
            # Directly nested in another clause: stays part of that clause.
            continue

        # An ``S`` consisting of a lone ``VP`` is left attached where it is.
        if (len(sub_tree) == 1 and sub_tree.label() == "S"
                and sub_tree[0].label() == "VP"):
            continue

        clause_trees.append(sub_tree)
        if parent is not None:
            # Detach so the enclosing clause no longer contains these words.
            # The root cannot be deleted from itself, so it is left alone.
            del sent_tree[sub_tree.treeposition()]

    # For each clause subtree, extract the relevant simple sentences.
    clause_list = []
    for clause in clause_trees:
        # Verb phrases must be read before the tree is pruned below.
        verb_phrases = get_verb_phrases(clause)

        # Prune verb phrases and the non-clause siblings flagged by
        # ``get_pos``; deleting back to front keeps earlier positions valid.
        vp_pos, sub_conj_pos = get_pos(clause)
        for pos in reversed(vp_pos):
            del clause[pos]
        for pos in reversed(sub_conj_pos):
            del clause[pos]

        # Whatever remains is mainly the subject.
        subject_phrase = " ".join(clause.leaves())
        for verb_phrase in verb_phrases:
            clause_list.append(subject_phrase + " " + verb_phrase)

    print(clause_list)
    return clause_list
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment