Skip to content

Instantly share code, notes, and snippets.

@jtauber jtauber/convert_to_dependency.py Secret

Last active Aug 29, 2015
Embed
What would you like to do?
#!/usr/bin/env python3
"""
Convert GBI Syntax Trees to a flat, dependency representation.
"""
from xml.etree import ElementTree
def morph_id(gbi_code):
    """
    Renumber the book prefix of a GBI morphId to the SBLGNT scheme.

    The first two characters of the code are read as the book number and
    shifted up by 21 (so that book numbers start at 61 like SBLGNT); the
    remainder of the code is carried over untouched.  A missing code (None)
    is passed straight through as None.
    """
    if gbi_code is not None:
        sblgnt_book = int(gbi_code[:2]) + 21
        return str(sblgnt_book) + gbi_code[2:]
    return None
def head(node):
    """
    Return the child of ``node`` that acts as its head.

    When the node carries a "Head" attribute, that attribute is the index of
    the head child.  Otherwise the second child is used, which is only
    expected for a small set of rules (checked with an assertion).
    """
    if "Head" in node.attrib:
        return node[int(node.attrib["Head"])]

    # No head is given for these rules, so we explicitly pick one ourselves.
    # Note: the same approach could be used to systematically *override* the
    # head wherever we disagree with the GBI choice.
    headless_rules = ("Conj-CL", "sub-CL", "that-VP")
    assert node.attrib["Rule"] in headless_rules
    return node[1]
def recursive_head(node):
    """
    Descend from ``node`` through successive head children and return the
    leaf (a node with no children) that is reached.
    """
    current = node
    # keep stepping into the head child until there are no children left
    while len(current) != 0:
        current = head(current)
    return current
def output_node(node, ancestors=None):
    """
    Print one dependency row for every leaf at or below ``node``.

    ``ancestors`` holds the nodes above ``node``, nearest first; it grows on
    the way down so that each leaf can climb back up the tree.  For a leaf,
    the climb continues for as long as the branch it came from is the head
    child of the ancestor, and stops at (and includes) the first ancestor
    whose head is a different child.  The word at the bottom of that
    ancestor's head chain is the leaf's head.  If the climb leads back to
    the leaf itself, the leaf has no head and None is reported.

    Each row is printed as ``word_id text head_id dependency_type``.
    """
    lineage = ancestors if ancestors else []

    if len(node) != 0:
        # a non-leaf node: recurse into each child with this node placed at
        # the front of the lineage
        extended = [node] + lineage
        for child in node:
            output_node(child, extended)
        return

    # from here on we are dealing with a leaf node
    attributes = node.attrib
    word_id = morph_id(attributes["morphId"])
    text = attributes["Unicode"].strip()

    # build the path upwards, stopping at (but including) the first ancestor
    # whose head is not the child we just came from
    path = [node]
    for ancestor in lineage:
        came_from = path[-1]
        path.append(ancestor)
        if head(ancestor) != came_from:
            break

    # follow the heads back down from the top of the path to find the word
    # that governs this leaf
    top_of_path = path[-1]
    head_id = morph_id(recursive_head(top_of_path).attrib["morphId"])
    if head_id == word_id:
        # we arrived back at the same word, so it has no head
        head_id = None

    # the dependency type is the Cat of the node one level below the top of
    # the path
    dependency_type = path[-2].attrib.get("Cat")

    print("{} {} {} {}".format(word_id, text, head_id, dependency_type))
def convert(filename):
    """
    Parse the XML file ``filename`` and output dependency rows for each
    sentence it contains.

    For every ``Sentence/Trees`` element under the root, only the first
    ``Tree`` is used, and rows are produced from that tree's first child.
    """
    document_root = ElementTree.parse(filename).getroot()
    # when a sentence has more than one analysis, only the first tree is
    # used (for now)
    first_analyses = document_root.findall("./Sentence/Trees/Tree[1]")
    for analysis in first_analyses:
        output_node(analysis[0])
if __name__ == "__main__":
    # Script entry point: convert the file named by the first command-line
    # argument.
    import sys

    # Fail with a short usage message (written to stderr, non-zero exit
    # status) rather than an IndexError traceback when no file is given.
    if len(sys.argv) < 2:
        sys.exit("usage: {} FILENAME".format(sys.argv[0]))
    convert(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.