"""Convert XML output of Stanford CoreNLP to CoNLL 2012 format.
$ ./corenlp.sh -annotators tokenize,ssplit,pos,lemma,ner,parse,dcoref \
-output.printSingletonEntities true \
-file /tmp/example.txt
$ python3 corenlpxmltoconll2012.py example.txt.xml > example.conll`
"""
import re
import sys
from lxml import etree


def gettext(node):
    """Safely read text of lxml node."""
    return '-' if node is None else node.text


def splitparse(parse):
    """Split PTB parse tree into parse bits, one per token."""
    parse = parse.replace('\n', ' ')
    # Replace each leaf '(POS word)' with '*', keep any following close
    # brackets, and put the parse bit of each token on its own line.
    result = re.sub(r'\([^\s()]+ [^\s()]+\)([^(]*)', r'*\1\n', parse)
    return result.replace(' ', '').splitlines()
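
# A worked example (illustrative input, not taken from CoreNLP output):
#   splitparse('(ROOT (S (NP (DT The) (NN cat)) (VP (VBZ sleeps))))')
#   returns ['(ROOT(S(NP*', '*))', '(VP*)))']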


def nerspans(tokens):
    """Create NER spans in CoNLL 2012 format from token-based NER labels.

    Single-token names: ['(PERSON)']; multiword names: ['(ORG*', '*', '*)'].
    Will not create nested NER spans such as (University of (California))."""
    result = ['*'] * len(tokens)
    nerlabels = [gettext(token.find('./NER')) for token in tokens]
    for n, ner in enumerate(nerlabels):
        if ner == '-' or ner == 'O':
            continue  # not part of a named entity
        elif n == 0 or nerlabels[n - 1] != ner:  # first token of a name
            if n == len(tokens) - 1 or nerlabels[n + 1] != ner:
                result[n] = '(%s)' % ner  # single-token name
            else:
                result[n] = '(%s*' % ner  # open a multiword span
        elif n == len(tokens) - 1 or nerlabels[n + 1] != ner:
            result[n] = '*)'  # close a multiword span
    return result
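
# For example, tokens with the NER labels
#   ['PERSON', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION']
# (an illustrative sequence) yield the spans
#   ['(PERSON)', '*', '(ORGANIZATION*', '*', '*)']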


def conv(filename):
    """Read CoreNLP XML file and print in CoNLL 2012 format on stdout."""
    doc = etree.parse(filename).getroot().find('./document')
    docid = doc.find('./docId').text
    partid = 0
    result = []
    for sent in doc.find('./sentences'):
        parsebits = splitparse(sent.find('./parse').text)
        nerbits = nerspans(sent.find('./tokens'))
        result.append([
            [docid,
             str(partid),
             token.get('id', '-'),
             gettext(token.find('./word')),
             gettext(token.find('./POS')),
             parsebits[n],
             '-',  # predicate lemma
             '-',  # predicate frameset ID
             '-',  # word sense
             gettext(token.find('./Speaker')),
             nerbits[n],
             '']  # coref chain; filled in below
            for n, token in enumerate(sent.find('./tokens'))])
    for clusterid, coref in enumerate(doc.find('./coreference')):
        for mention in coref:
            # CoreNLP indices are 1-based, and 'end' points one token past
            # the end of the mention; convert to 0-based indices.
            sentid = int(mention.find('./sentence').text) - 1
            start = int(mention.find('./start').text) - 1
            end = int(mention.find('./end').text) - 1
            if start == end - 1:  # single-token mention
                result[sentid][start][-1] += '(' + str(clusterid) + ')|'
            else:  # multi-token mention: open at first token, close at last
                result[sentid][start][-1] += '(' + str(clusterid) + '|'
                result[sentid][end - 1][-1] += str(clusterid) + ')|'
    print('#begin document (%s); part %03d' % (docid, partid))
    for chunk in result:
        for line in chunk:
            # Drop the trailing '|' separator; '-' marks an empty column.
            line[-1] = line[-1].rstrip('|') or '-'
            print('\t'.join(line))
        print()
    print('#end document')
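
# The coreference column follows the CoNLL 2012 convention: '(3' opens a
# mention of chain 3 on its first token, '3)' closes it on its last token,
# and '(3)' marks a single-token mention; multiple mentions on one token are
# joined with '|', e.g. '(3|(7)'.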


if __name__ == '__main__':
    conv(sys.argv[1])