Created
July 7, 2018 10:18
-
-
Save avidanyum/3edfbc96ea22807445ab5307830d41db to your computer and use it in GitHub Desktop.
sentence deletion invalid index
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib import urlopen | |
import json | |
response = urlopen('https://raw.githubusercontent.com/zhaohengyang/Generate-Parallel-Data-for-Sentence-Compression/master/sample_data/parallel_data_example.json') | |
# with open('sample_data/parallel_data_example.json', 'r') as fp: | |
# data_example | |
data_example = json.load(response) | |
print(data_example[:2]) | |
import os | |
import sys | |
import traceback | |
import copy | |
import pydot | |
import os | |
from matplotlib.pyplot import imshow | |
import numpy as np | |
import matplotlib.image as mpimg | |
import matplotlib.pyplot as plt | |
import spacy | |
print('if you didnt run: python -m spacy download en') | |
import spacy.lang.en | |
nlp = spacy.load('en') | |
from spacy.tokens.span import Span | |
import copy | |
from nltk.stem.porter import PorterStemmer | |
from spacy.attrs import ORTH, DEP, HEAD | |
merge_rules = {"group1": ["aux", "auxpass", "det", "nummod", "case", | |
"prt", "poss", "of", "nmod", "compound", | |
"neg", "xcomp", "quantmod", "advmod", "attr", | |
"pobj", "as", "aux", "dobj", "amod", | |
"npadvmod"], | |
"group2": ["cc"], | |
"group3": ["mark", "," ]} | |
worked = {"npadvmod": [30, 37], "to": [40], "mark": [67], "attr": [58], "pobj": [69], "punct": [69], | |
"conj": [69], "dobj": [77], "nsubj": [84, 86], "amod": [98], "ccomp": [4]} | |
not_worked = {"npadvmod": [30], "mark": [41], "cc": [38], "advcl": [41], "pobj": [45], "with": [63], | |
"conj": [71, 76], "nsubj": [47, 77], "ccomp": [16]} | |
# Resize and clean edges | |
def plot_im(im, dpi=80): | |
py,px,_ = im.shape # depending of your matplotlib.rc you may have to use py,px instead | |
size = (py/np.float(dpi), px/np.float(dpi)) # note the np.float() | |
fig = plt.figure(figsize=size, dpi=dpi) | |
# fig = plt.figure(figsize=(10,20), dpi=dpi) | |
ax = fig.add_axes([0, 0, 1, 1]) | |
# Customize the axis | |
# remove top and right spines | |
ax.spines['right'].set_color('none') | |
ax.spines['left'].set_color('none') | |
ax.spines['top'].set_color('none') | |
ax.spines['bottom'].set_color('none') | |
# turn off ticks | |
ax.xaxis.set_ticks_position('none') | |
ax.yaxis.set_ticks_position('none') | |
ax.xaxis.set_ticklabels([]) | |
ax.yaxis.set_ticklabels([]) | |
ax.imshow(im) | |
plt.show() | |
def get_decode(s): | |
return unicode(s).encode("utf-8") | |
class Tree_node(): | |
# Initialize tree | |
def __init__(self, node): | |
self.node = node | |
# Get node's id | |
def id(self): | |
return self.node[u'word'][self.node['head_word_index']]['id'] | |
# Get head word tag | |
def head_word_tag(self): | |
return self.node[u'word'][self.node['head_word_index']]['tag'] | |
# Get head word stem | |
def head_word_stem(self): | |
return self.node[u'word'][self.node['head_word_index']]['stem'] | |
# Get head word | |
def head_word(self): | |
return self.node[u'word'][self.node['head_word_index']] | |
# Get tag of each word in node | |
def tags(self): | |
return [word['tag'] for word in self.node['word']] | |
# Get stem of each word in node | |
def stems(self): | |
return [word['stem'] for word in self.node['word']] | |
# Get form of each word in node | |
def forms(self): | |
return [word['form'] for word in self.node['word']] | |
# Get id of each word in node | |
def ids(self): | |
return [word['id'] for word in self.node['word']] | |
# Get edge | |
def edge(self): | |
return self.node['edge'] | |
# Get form | |
def form(self): | |
return self.node['form'] | |
# Get edge label | |
def edge_label(self): | |
return self.node['edge']['label'] | |
# Get edge parent id | |
def edge_parent_id(self): | |
return self.node['edge']['parent_id'] | |
# Get word | |
def word(self): | |
return self.node['word'] | |
# Set new parent id | |
def set_parent_id(self, parent_id): | |
self.node['edge']['parent_id'] = parent_id | |
# Set new form | |
def set_form(self, form): | |
self.node['form'] = form | |
# Set new word | |
def set_word(self, word): | |
self.node['word'] = word | |
# Set head word index | |
def set_head_word_index(self, index): | |
self.node['head_word_index'] = index | |
# Set edge label | |
def set_edge_label(self, label): | |
self.node['edge']['label'] = label | |
# Show form id combination | |
def node_forms_and_ids(self): | |
return " ".join([word[u'form'] + "_" + str(word[u'id']) | |
for word in self.word()]) | |
# Show node | |
def describe(self): | |
return get_decode("node: {:<20} head_word_id:{:<20}".format(self.node_forms_and_ids(), self.id())) | |
class Parsed_Tree(): | |
''' | |
Manage and maintain an Tree | |
''' | |
# Initialize from string | |
def __init__(self, nodes): | |
self.tree = nodes | |
def get_copy(self): | |
return copy.deepcopy(self) | |
# Delete node | |
def remove_node(self, node): | |
self.tree.remove(node) | |
# Add node | |
def append_node(self, node): | |
self.tree.append(node) | |
# Any children of A will point to B | |
def update_children(self, A, B): | |
for child in self.children(A): | |
child.set_parent_id(B.id()) | |
# Merge A to B (parent of A) | |
def merge(self, A, B): | |
parent_head_word = B.head_word()['form'] | |
new_word = A.word() + B.word() | |
new_word.sort(key=lambda x: x['id']) | |
word_list = [word[u"form"] for word in new_word] | |
B.set_word(new_word) | |
B.set_form(" ".join(word_list)) | |
B.set_head_word_index(word_list.index(parent_head_word)) | |
self.update_children(A, B) | |
self.remove_node(A) | |
# Insert between A, B(child of A) | |
def insert_between(self, node, A, B): | |
# node point to A | |
node.set_parent_id(A.id()) | |
# B point to node | |
B.set_parent_id(node.id()) | |
# Get children node | |
def children(self, node): | |
children = [] | |
for child_node in self.tree: | |
if node.id() == child_node.edge_parent_id(): | |
children.append((child_node)) | |
return children | |
# Find parent node | |
def find_parent_node(self, node): | |
return self.find_node_by_id(node.edge_parent_id()) | |
# check consistency | |
def consistency(self): | |
assert all([self.find_parent_node(node) for node in self.tree]) | |
assert len(self.all_roots()) == 1 | |
# check if node is root node | |
def is_root(self, node): | |
return node.id() == node.edge_parent_id() | |
# Get all root nodes if it has more than one (shouldn't), used for check consistency | |
def all_roots(self): | |
# The nodes is a tree, each node has one edge to it's parent | |
return [node for node in self.tree if self.is_root(node)] | |
# Get root node | |
def root_node(self): | |
return self.all_roots()[0] | |
# Get path to root | |
def path_to_root(self, node): | |
path = [] | |
current_node = node | |
while not self.is_root(current_node): | |
path.append(current_node) | |
current_node = self.find_parent_node(current_node) | |
path.append(current_node) | |
return path | |
# Get path from A to B | |
def path(self, A, B, debug = False): | |
self.consistency() | |
A_path_root = [node.id() for node in self.path_to_root(A)] | |
B_path_root = [node.id() for node in self.path_to_root(B)] | |
B_path_root.reverse() | |
joined = set(A_path_root) & set(B_path_root) | |
up = copy.deepcopy(A_path_root) | |
[up.remove(item) for item in joined] | |
down = copy.deepcopy(B_path_root) | |
[down.remove(item) for item in joined] | |
[A_path_root.remove(item) for item in up] | |
top = [] + A_path_root[:1] | |
if debug: | |
print("up:", up) | |
print("top", top) | |
print("down:", down) | |
return up, top, down | |
# Add an dummy on top of original root | |
def add_dummy_root(self): | |
# -- Add dummy root node | |
# Create an dummy root node, append it to node list | |
dummy_root_id = -1 | |
dummy_root = {u'form': u'ROOT', | |
u'head_word_index': 0, | |
u'word': [{u'tag': u'ROOT', | |
# u'dep': u'ROOT_To_Self', | |
u'id': dummy_root_id, | |
u'form': u'ROOT', | |
u'stem': u'ROOT'}], | |
u'edge': {u'parent_id': dummy_root_id, u'label': u'ROOT_To_Self'} | |
} | |
self.append_node(Tree_node(dummy_root)) | |
# Find original root node, which contains self pointed edge | |
root_node = self.root_node() | |
root_node.set_parent_id(dummy_root_id) | |
# Get node given id | |
def find_node_by_id(self, id, debug=False): | |
found = None | |
if debug: | |
print("Debug ----- find_node_by_id ----- ") | |
print("target id:", id) | |
print([node.id() for node in self.tree]) | |
for node in self.tree: | |
if id in node.ids(): | |
found = node | |
break | |
return found | |
# Check if node is in the tree | |
def is_node_in(self, node): | |
if self.find_node_by_id(node.id()): | |
return True | |
else: | |
return False | |
# Find neighbor nodes | |
def find_neighbor(self, node, debug = False): | |
node_ids = [tree_node.id() for tree_node in self.tree] | |
node_ids.sort() | |
rights = filter(lambda x: x > node.id(), node_ids)[:1] | |
right = next(iter(rights), None) | |
node_ids.reverse() | |
lefts = filter(lambda x: x < node.id(),node_ids )[:1] | |
left = next(iter(lefts), None) | |
left_node = self.find_node_by_id(left) if left else None | |
right_node = self.find_node_by_id(right) if right else None | |
if debug: | |
print("find_neighbor:") | |
print("ids: {}".format(node_ids)) | |
print("node: {}, left: {}, right: {}".format(node.id(), left, right)) | |
return left_node, right_node | |
# Print tree | |
def print_edges(self, debug=False): | |
def get_tags(node): | |
return get_decode(",".join(node.tags())) | |
for node in self.tree: | |
parent_id = node.edge_parent_id() | |
parent_node = self.find_node_by_id(parent_id) | |
if node.edge_label(): | |
if not parent_node: | |
self.find_node_by_id(parent_id, debug=True) | |
print(get_decode("{:<20}:{:<20}{}->{}".format(node.id(), node.edge_label(), node.form(), parent_id))) | |
else: | |
print(get_decode("{:<20}:{:<20}{}->{}[{}]".format(node.id(), node.edge_label(), | |
node.form(), | |
parent_node.form(), | |
get_tags(parent_node) | |
))) | |
# Print a graphic tree | |
def print_graph(self, color_settings = {}): | |
graph = pydot.Dot(graph_type='digraph') | |
name_to_node = {} | |
id_to_color = {} | |
for color, ids in color_settings.items(): | |
for id in ids: | |
id_to_color[id] = color | |
all_ids = [node.id() for node in self.tree] | |
for id in all_ids: | |
color = id_to_color.get(id, 'gray') | |
node = self.find_node_by_id(id) | |
node_label = node.node_forms_and_ids() | |
name_to_node[node_label] = pydot.Node(node_label, style="filled", fillcolor=color) | |
for node in name_to_node.values(): | |
graph.add_node(node) | |
for edge in self.get_edges(): | |
node_a_name, node_b_name = edge | |
graph.add_edge(pydot.Edge(name_to_node[node_a_name], name_to_node[node_b_name])) | |
graph.write_png('graph.png') | |
img = mpimg.imread('graph.png') | |
plot_im(img, dpi=40) | |
os.remove("graph.png") | |
# Get and edge from a node to its parent | |
def get_edge(self, node): | |
parent_id = node.edge_parent_id() | |
parent_node = self.find_node_by_id(parent_id) | |
return (parent_node.node_forms_and_ids(), node.node_forms_and_ids()) | |
# Get tree edges | |
def get_edges(self): | |
return [self.get_edge(node) for node in self.tree] | |
class Sentence_Reduction(object): | |
def __init__(self, sentence_tree, headline_tree): | |
# parse sentence into tree structure | |
self.sentence_tree = sentence_tree | |
# parse headline into tree structure | |
self.headline_tree = headline_tree | |
# Transfer headline into transfered_headline | |
self.transfered_headline = None | |
# Transfer sentence_tree into transfered_tree | |
self.transfered_tree = None | |
# Flat transfered_tree into flatten_tree | |
self.flatten_tree = None | |
# Given headline_tree, reduce flatten_tree into reduced_tree | |
self.reduced_tree = None | |
# Get reduced_sentence from reduced_tree | |
self.reduced_sentence = None | |
def transfer_tree(self, debug=False): | |
# Start | |
self.transfered_tree = self.sentence_tree.get_copy() | |
self.transfered_tree.consistency() | |
# Add dummy root | |
self.transfered_tree.add_dummy_root() | |
# Remove node that falls in ignore rules | |
def ignore_node(node): | |
# ignore_rules = ["``", "''", "'"] | |
ignore_rules = [] | |
if node.form() in ignore_rules: | |
parent_node = self.transfered_tree.find_parent_node(node) | |
self.transfered_tree.update_children(node, parent_node) | |
return True | |
else: | |
return False | |
# remove node in ignore rules | |
self.transfered_tree.tree[:] = [node for node in self.transfered_tree.tree if not ignore_node(node)] | |
# -- preposition, punctuation replacement | |
part_of_speach = ['prep', 'punct'] | |
for node in self.transfered_tree.tree: | |
if node.edge_label() in part_of_speach: | |
node.set_edge_label(node.head_word()['form']) | |
# -- move conjunction word | |
for node in self.transfered_tree.tree: | |
if node.edge_label() in merge_rules['group2']: | |
# print("Found cc node: node label: {:<20} id: {:<20} form: {:<20}".format(node.edge_label(), node.id(), node.form())) | |
_, right_neighbor = self.transfered_tree.find_neighbor(node) | |
up, top, down = self.transfered_tree.path(node, right_neighbor) | |
if up and down: | |
A_node = self.transfered_tree.find_node_by_id(top[0]) | |
B_node = self.transfered_tree.find_node_by_id(down[0]) | |
self.transfered_tree.insert_between(node, A_node, B_node) | |
# Take a transfered tree and flat it | |
def flat_tree(self): | |
self.flatten_tree = self.transfered_tree.get_copy() | |
self.flatten_tree.consistency() | |
# Remove node each time after merged to its parent node | |
for node in list(self.flatten_tree.tree): | |
if node.edge_label() in merge_rules['group1']: | |
self.flatten_tree.merge(node, self.flatten_tree.find_parent_node(node)) | |
# Transfer headline | |
def transfer_headline(self, debug = False): | |
self.transfered_headline = self.headline_tree.get_copy() | |
self.transfered_headline.consistency() | |
# Remove node that falls in ignore rules | |
def ignore_node(node): | |
headline_ignore_rules = ['IN', '``', "''", "DT", ':', '.', 'POS'] | |
if node.head_word_tag() in headline_ignore_rules: | |
parent_node = self.transfered_headline.find_parent_node(node) | |
self.transfered_headline.update_children(node, parent_node) | |
return True | |
else: | |
return False | |
if debug: | |
for node in self.headline_tree.tree: | |
print("{}--{}".format(get_decode(node.head_word_stem()), get_decode(node.head_word_tag()))) | |
self.transfered_headline.tree[:] = [node for node in self.transfered_headline.tree if not ignore_node(node)] | |
def reduce_sentence_by_headline(self, addtional_stem = None, debug=False): | |
# def is_verb(tag): | |
# return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'] | |
def stemming(node): | |
if addtional_stem: | |
stems = [addtional_stem(form.lower()) for form in node.forms()] | |
else: | |
stems = [stem.lower() for stem in node.stems()] | |
return " ".join(stems) | |
def get_node_set(node): | |
stem = stemming(node) | |
return stem.split() | |
def is_not_in_headline(node, headline_stems): | |
return not bool(set(get_node_set(node)) & set(headline_stems)) | |
def check_common_and_update(node, debug = False): | |
node_stems = get_node_set(node) | |
common = set(node_stems) & set(headline_stems) | |
if bool(common): | |
for item in common: | |
headline_stems.remove(item) | |
if debug: | |
print("modified_headline_stems: ", headline_stems) | |
return True | |
else: | |
return False | |
# Start | |
self.reduced_tree = self.flatten_tree.get_copy() | |
self.reduced_tree.consistency() | |
# Get a list of stems and flatten the list | |
headline_stems = [get_node_set(headline_node) for headline_node in self.transfered_headline.tree] | |
headline_stems = [item for itemset in headline_stems for item in itemset] | |
# Get a list of connect words for later use | |
connect_nodes = [node for node in self.reduced_tree.tree if node.edge_label() in merge_rules['group3']] | |
# Keep node that has headline stem | |
self.reduced_tree.tree[:] = [node for node in self.reduced_tree.tree if check_common_and_update(node)] | |
# if headline_stems: | |
# print("{} -- Found unmatched headlines: {}".format(self.reduce_sentence_by_headline.__name__, headline_stems)) | |
# Return each part of the flatten tree, use different color to print graph | |
reduced_tree_ids = [n.id() for n in self.reduced_tree.tree] | |
# Add node on the path to reduced tree | |
nodes_on_the_path = [] | |
processed = [] | |
for index, node in enumerate(self.reduced_tree.tree): | |
path = self.flatten_tree.path_to_root(node) | |
path_label = [self.reduced_tree.is_node_in(node) for node in path] | |
# the last item is zero(dummy root) | |
# Find the first True and the last True | |
# Index in between will be added to reduced graph | |
start, end = path_label.index(True) + 1, len(path_label) - path_label[::-1].index(True) - 1 | |
for node_on_path in path[start:end]: | |
if not self.reduced_tree.is_node_in(node_on_path) and node_on_path.id() not in processed: | |
# print("add current node: {}".format(path[node_index]['form'])) | |
processed.append(node_on_path.id()) | |
nodes_on_the_path.append(node_on_path) | |
# Return each part of the flatten tree, use different color to print graph | |
path_node_ids = [n.id() for n in nodes_on_the_path] | |
self.reduced_tree.tree += nodes_on_the_path | |
def use_connect_word(node): | |
# Find connnect word like "that" or "which", we use them only if both left words | |
# and right words are selected in reduced tree. | |
# The left word is the word right before the connect word | |
# The right word is any word after connect | |
left_word_id = node.id() - 1 | |
left_node, right_node = self.reduced_tree.find_neighbor(node) | |
return self.reduced_tree.find_node_by_id(left_word_id) and right_node | |
# Add connect word fot reduced tree if needed | |
connect_nodes[:] = [node for node in connect_nodes if use_connect_word(node) and not self.reduced_tree.is_node_in(node)] | |
for node in connect_nodes: | |
print("Found connect node: {}".format(node.describe())) | |
# Return each part of the flatten tree, use different color to print graph | |
connect_nodes_ids = [n.id() for n in connect_nodes] | |
self.reduced_tree.tree += connect_nodes | |
# Make reduced tree consistent | |
for reduced_node in self.reduced_tree.tree: | |
if not self.reduced_tree.find_parent_node(reduced_node): | |
reduced_node.set_parent_id(reduced_node.id()) | |
return reduced_tree_ids, path_node_ids, connect_nodes_ids | |
# Generate reduced sentence from reduced node | |
def generate_reduced_sentence(self): | |
id_word_pairs = [(word[u'id'], word[u'form']) for reduced_node in self.reduced_tree.tree | |
for word in reduced_node.word()] | |
id_word_pairs.sort(key=lambda tuple: tuple[0]) | |
self.reduced_sentence = " ".join([tuple[1] for tuple in id_word_pairs]) | |
# Construct tree from sentence | |
def parse_info(sentence): | |
doc = nlp(sentence) | |
heads = [index + item[0] for index, item in enumerate(doc.to_array([HEAD]))] | |
nodes = [{u"form": token.orth_, | |
u"head_word_index": 0, | |
u"word": [{u"id": current_id, | |
# u"dep": doc[current_id].dep_, | |
u"tag": token.tag_, | |
u"form": token.orth_, | |
u"stem": token.lemma_ | |
}], | |
u"edge": {u"parent_id": parent_id, u"label": doc[current_id].dep_} | |
} | |
for current_id, (token, parent_id) in enumerate(zip(doc, heads))] | |
return [Tree_node(node) for node in nodes] | |
#import parallel_data_gen | |
#reload(parallel_data_gen) | |
from pattern.en import conjugate, lemma, lexeme | |
additional_stem = lemma | |
def reduce_sentence(sample, debug = False): | |
sentence = sample['sentence'] | |
headline = sample['headline'] | |
sentence_info = parse_info(sentence) | |
headline_info = parse_info(headline) | |
sentence_nodes = Parsed_Tree(sentence_info) | |
headline_nodes = Parsed_Tree(headline_info) | |
sentence_reduction = Sentence_Reduction(sentence_nodes, headline_nodes) | |
sentence_reduction.transfer_tree() | |
sentence_reduction.flat_tree() | |
sentence_reduction.transfer_headline() | |
reduced_tree_ids, path_node_ids, connect_nodes_ids = sentence_reduction.reduce_sentence_by_headline(additional_stem) | |
sentence_reduction.generate_reduced_sentence() | |
print("-------- Sentence -------------------") | |
print(sentence) | |
print("-------- Headline -------------------") | |
print(headline) | |
print("-------- reduced_sentence -------------------") | |
print(sentence_reduction.reduced_sentence) | |
if debug: | |
print("-------- Sentence tree-------------------") | |
# sentence_reduction.sentence_tree.print_edges() | |
sentence_reduction.sentence_tree.print_graph() | |
print("-------- Transfered tree -------------------") | |
# sentence_reduction.transfered_tree.print_edges() | |
sentence_reduction.transfered_tree.print_graph() | |
print("-------- Flatten tree -------------------") | |
sentence_reduction.flatten_tree.print_graph() | |
# print("-------- Headline tree-------------------") | |
# sentence_reduction.headline_tree.print_graph() | |
print("-------- Transfered Headline tree-------------------") | |
sentence_reduction.transfered_headline.print_graph() | |
print("-------- Reducing tree: matching -------------------") | |
color_settings = {"pink":reduced_tree_ids, "darkkhaki":path_node_ids, "brown3":connect_nodes_ids} | |
sentence_reduction.flatten_tree.print_graph(color_settings) | |
print("-------- Reduced tree: after reduce -------------------") | |
sentence_reduction.reduced_tree.print_graph() | |
start = 50 | |
end = 60 | |
for index, sample in enumerate(data_example[start:end]): | |
sentence = sample['sentence'] | |
headline = sample['headline'] | |
print("") | |
print("----------------{}-------------------".format(index + start)) | |
print("reducing sentence: " + sentence) | |
print("reducing headline: " + headline) | |
try: | |
reduce_sentence(sample) | |
except Exception as e: | |
print(traceback.format_exc()) | |
print("Spacy library couldn't parse sentence into a tree. Please ignore this sentence pair") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment