Skip to content

Instantly share code, notes, and snippets.

@leebird
Last active April 8, 2016 19:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save leebird/1777642b0f15a87179327046e7744cdd to your computer and use it in GitHub Desktop.
Save leebird/1777642b0f15a87179327046e7744cdd to your computer and use it in GitHub Desktop.
Simple JSON-to-BioC converter
from __future__ import unicode_literals
import sys
import codecs
import json
from lxml import etree
# See http://lxml.de/api.html#incremental-xml-generation
# for incremental XML generation used below.
def write_entity(xf, entity, doc_text=None):
    """Write one entity dict as a BioC ``annotation`` element.

    Args:
        xf: Incremental XML writer providing ``element()`` and ``write()``
            (the script passes the object yielded by ``etree.xmlfile``).
        entity: Dict read for ``duid``, ``entityType``, ``source``,
            ``charStart``, ``charEnd`` and, when present, the ``entityId``
            and ``attribute`` lists.
        doc_text: Text that the entity's character offsets index into.
            When omitted, the module-level ``doc['text']`` is used, which
            keeps existing two-argument calls working.

    Raises:
        KeyError: If one of the required entity keys is missing.
    """
    if doc_text is None:
        # Backward compatibility: older callers rely on the global ``doc``
        # being the document currently converted.
        doc_text = doc['text']

    first = entity['charStart']
    # charEnd is treated as inclusive, hence the +1 for length and slice.
    stop = entity['charEnd'] + 1

    with xf.element('annotation', id=entity['duid']):
        with xf.element('infon', key='type'):
            xf.write(entity['entityType'])
        with xf.element('infon', key='source'):
            xf.write(entity['source'])
        # The BioC DTD declares annotation as (infon*, location*, text), so
        # every infon has to be emitted before location and text.
        for entity_id in entity.get('entityId') or ():
            with xf.element('infon', key=entity_id['source']):
                xf.write(entity_id['idString'])
        for attr in entity.get('attribute') or ():
            with xf.element('infon', key=attr['key']):
                xf.write(attr['value'])
        xf.write(etree.Element('location',
                               offset=str(first),
                               length=str(stop - first)))
        with xf.element('text'):
            xf.write(doc_text[first:stop])
def _write_relation(xf, relation):
    """Write one relation dict as a BioC ``relation`` element."""
    with xf.element('relation', id=relation['duid']):
        with xf.element('infon', key='relation type'):
            xf.write(relation['relationType'])
        with xf.element('infon', key='source'):
            xf.write(relation['source'])
        # The BioC DTD declares relation as (infon*, node*), so the
        # attribute infons must be written before any node.
        for attr in relation.get('attribute') or ():
            with xf.element('infon', key=attr['key']):
                xf.write(attr['value'])
        for arg in relation.get('argument') or ():
            xf.write(etree.Element('node',
                                   refid=arg['entity_duid'],
                                   role=arg['role']))


def _write_sentence(xf, text, sent, entities):
    """Write one ``sentence`` element with the entities starting inside it.

    ``entities`` are the non-sentence entities of the document; an entity
    belongs to the sentence when its charStart lies within the sentence's
    inclusive [charStart, charEnd] range.
    """
    first, last = sent['charStart'], sent['charEnd']
    with xf.element('sentence'):
        with xf.element('offset'):
            xf.write(str(first))
        with xf.element('text'):
            xf.write(text[first:last + 1])
        for entity in entities:
            if first <= entity['charStart'] <= last:
                write_entity(xf, entity)


if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.exit('usage: %s JSON_FILE XML_FILE [DOC_ID]' % sys.argv[0])
    json_file_path, xml_file_path = sys.argv[1], sys.argv[2]
    # Only the document with this id is converted. The default is the id
    # the script used to hard-code; pass an empty string as the third
    # argument to convert every document in the input.
    doc_id_filter = sys.argv[3] if len(sys.argv) > 3 else '25789565'

    with codecs.open(json_file_path, 'r', 'utf8') as jf, \
            etree.xmlfile(xml_file_path, encoding='utf8') as xf:
        with xf.element('collection'):
            # source, date and key are written as empty elements.
            for tag in ('source', 'date', 'key'):
                xf.write(etree.Element(tag))

            for line in jf:
                line = line.strip()
                if not line:
                    # Tolerate blank lines instead of failing in json.loads.
                    continue
                # NOTE: write_entity() reads the module-level name ``doc``,
                # so this variable must keep its name and stay global.
                doc = json.loads(line)
                if doc_id_filter and doc['docId'] != doc_id_filter:
                    continue

                all_entities = doc.get('entity') or []
                # Sentences sorted by their character offset.
                sentences = sorted(
                    (e for e in all_entities
                     if e['entityType'] == 'SENTENCE'),
                    key=lambda s: s['charStart'])

                with xf.element('document'):
                    with xf.element('id'):
                        xf.write(doc['docId'])
                    with xf.element('passage'):
                        with xf.element('offset'):
                            # The whole text is written as a single
                            # passage, so its offset is always 0.
                            xf.write('0')
                        if not sentences:
                            # Without sentences, write the full text
                            # followed by all entities.
                            with xf.element('text'):
                                xf.write(doc['text'])
                            for entity in all_entities:
                                write_entity(xf, entity)
                        else:
                            # With sentences, write each entity inside
                            # the sentence it starts in.
                            entities = sorted(
                                (e for e in all_entities
                                 if e['entityType'] != 'SENTENCE'),
                                key=lambda e: e['charStart'])
                            for sent in sentences:
                                _write_sentence(xf, doc['text'], sent,
                                                entities)
                        # Relations follow the text/sentences of the
                        # passage.
                        for relation in doc.get('relation') or ():
                            _write_relation(xf, relation)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment