# Rename numeric entity labels in .xmi file to text of first mention
"""Rename numeric entity labels in .xmi file to text of first mention.
Usage: python3 <FILE>...
Original file is modified in-place.
Only non-empty entities with numeric names are changed.
"""
import os
import sys
from lxml import etree
def conv(filename):
    """Relabel numerically named entities of an .xmi file, in place.

    Each ``v1:Entity`` element whose ``Label`` attribute is numeric and
    which is referenced by at least one ``v1:Mention`` gets a new label:
    the document text (``sofaString`` of the ``cas:Sofa`` element) covered
    by its earliest mention, i.e. the one with the smallest ``begin``
    offset.  Entities without a ``Label`` attribute, with a non-numeric
    label, or without any mention are left untouched.

    The file is then overwritten with the modified document, UTF-8
    encoded, with an XML declaration and without pretty-printing.

    Args:
        filename: Path of the .xmi file to read and overwrite.
    """
    doc = etree.parse(filename)
    root = doc.getroot()
    nsmap = root.nsmap
    text = root.find('./cas:Sofa', nsmap).get('sofaString')

    # Earliest (begin, end) span per entity id.  The comparison is strict,
    # so on equal begin offsets the mention seen first in document order
    # is kept.
    firstmention = {}
    for mention in root.findall('./v1:Mention', nsmap):
        entityid = mention.get('Entity')
        span = (int(mention.get('begin')), int(mention.get('end')))
        known = firstmention.get(entityid)
        if known is None or span[0] < known[0]:
            firstmention[entityid] = span

    for entity in root.findall('./v1:Entity', nsmap):
        label = entity.get('Label')
        # get() returns None for a missing attribute; such an entity has
        # no numeric name, so skip it instead of failing on None.
        if label is None or not label.isnumeric():
            continue
        # NOTE(review): '{}id' has an empty namespace part -- confirm the
        # intended namespace URI of the id attribute was not lost.
        span = firstmention.get(entity.get('{}id'))
        if span is None:
            # skip empty entities we haven't seen mentions for
            continue
        begin, end = span
        entity.attrib['Label'] = text[begin:end]

    with open(filename, 'wb') as out:
        out.write(etree.tostring(doc, pretty_print=False,
                                 xml_declaration=True, encoding='UTF-8'))
def main():
    """Relabel entities in every file named on the command line.

    The file names are taken from ``sys.argv[1:]``.  All paths are checked
    before any file is touched, because conv() rewrites its input in
    place: if at least one path does not exist, each missing path is
    reported and nothing is processed.  Otherwise every file is announced
    and passed to conv().  With no arguments this does nothing.
    """
    filenames = sys.argv[1:]

    missing = [name for name in filenames if not os.path.exists(name)]
    for name in missing:
        print('File not found: %s' % name)
    if missing:
        # Abort before modifying anything rather than processing a
        # partial set of files.
        return

    for name in filenames:
        print('Processing: %s' % name)
        conv(name)
if __name__ == '__main__':
