leebird/converter.py

## converter.py
from __future__ import unicode_literals
import sys
import codecs
import json
from lxml import etree

# See http://lxml.de/api.html#incremental-xml-generation
# for incremental XML generation used below.


def write_entity(xf, entity, doc_text=None):
    """Write one entity as an ``<annotation>`` element.

    The annotation holds, in this order: a ``type`` infon, a ``source``
    infon, one infon per entry of ``entity['entityId']``, one infon per
    entry of ``entity['attribute']``, a ``<location>`` element and the
    covered ``<text>``.

    Args:
        xf: Incremental XML writer (as yielded by ``etree.xmlfile``); only
            its ``element()`` and ``write()`` methods are used.
        entity: Mapping with the keys ``duid``, ``entityType``, ``source``,
            ``charStart``, ``charEnd``, ``entityId`` and ``attribute``.
        doc_text: Text that ``charStart``/``charEnd`` index into. When
            omitted, the module-level ``doc['text']`` is used, which is
            what two-argument callers have always relied on.

    Raises:
        KeyError: If ``entity`` lacks one of the keys listed above.
        NameError: If ``doc_text`` is omitted and no module-level ``doc``
            exists.
    """
    if doc_text is None:
        # Backward-compatible fallback: the conversion loop binds the
        # current document to the module-level name `doc`.
        doc_text = doc['text']

    begin = entity['charStart']
    # charEnd is inclusive, so the exclusive slice bound is one past it.
    stop = entity['charEnd'] + 1

    with xf.element('annotation', id=entity['duid']):
        with xf.element('infon', key='type'):
            xf.write(entity['entityType'])
        with xf.element('infon', key='source'):
            xf.write(entity['source'])

        # Every infon is emitted before <location> and <text>: the BioC DTD
        # models annotation as (infon*, location*, text), so infons that
        # trail the text would not validate.
        # NOTE(review): assumes the output is meant to be BioC XML, which
        # the element names suggest - confirm.
        for entity_id in entity['entityId']:
            with xf.element('infon', key=entity_id['source']):
                xf.write(entity_id['idString'])
        for attr in entity['attribute']:
            with xf.element('infon', key=attr['key']):
                xf.write(attr['value'])

        xf.write(etree.Element('location',
                               offset=str(begin),
                               length=str(stop - begin)))
        with xf.element('text'):
            xf.write(doc_text[begin:stop])


# Command line: converter.py JSON_FILE XML_FILE [DOC_ID]
#   JSON_FILE  input, one JSON document per line (UTF-8)
#   XML_FILE   output XML file
#   DOC_ID     optional id of the single document to convert
if len(sys.argv) < 3:
    sys.exit('usage: converter.py JSON_FILE XML_FILE [DOC_ID]')

json_file_path, xml_file_path = sys.argv[1], sys.argv[2]
# Only the document with this id is converted. The default is the id that
# used to be hard-coded, so two-argument invocations behave as before.
doc_id_filter = sys.argv[3] if len(sys.argv) > 3 else '25789565'

with codecs.open(json_file_path, 'r', 'utf8') as jf, \
        etree.xmlfile(xml_file_path, encoding='utf8') as xf:
    with xf.element('collection'):
        # Collection header: three empty elements.
        for header_tag in ('source', 'date', 'key'):
            xf.write(etree.Element(header_tag))

        for line in jf:
            line = line.strip()
            if not line:
                # A blank line is not a JSON document; skip it instead of
                # letting json.loads() fail on an empty string.
                continue

            # `doc` has to stay a module-level name: write_entity() looks
            # the document text up through it.
            doc = json.loads(line)
            if doc['docId'] != doc_id_filter:
                continue

            # Sentences ordered by their character offset.
            sentences = sorted(
                (e for e in doc['entity'] if e['entityType'] == 'SENTENCE'),
                key=lambda s: s['charStart'])

            with xf.element('document'):
                with xf.element('id'):
                    xf.write(doc['docId'])
                with xf.element('passage'):
                    with xf.element('offset'):
                        # The abstract is the only passage, so its offset
                        # is always 0.
                        xf.write('0')

                    if not sentences:
                        # No sentence segmentation: write the whole text
                        # followed by every entity.
                        with xf.element('text'):
                            xf.write(doc['text'])

                        for entity in doc['entity']:
                            write_entity(xf, entity)
                    else:
                        # Attach each non-sentence entity to the sentence
                        # whose span contains the entity's start offset.
                        entities = sorted(
                            (e for e in doc['entity']
                             if e['entityType'] != 'SENTENCE'),
                            key=lambda e: e['charStart'])

                        for sent in sentences:
                            sent_start = sent['charStart']
                            sent_end = sent['charEnd']  # inclusive

                            with xf.element('sentence'):
                                with xf.element('offset'):
                                    xf.write(str(sent_start))
                                with xf.element('text'):
                                    xf.write(
                                        doc['text'][sent_start:sent_end + 1])

                                for entity in entities:
                                    # Entities are sorted, so the first one
                                    # starting past this sentence ends the
                                    # scan.
                                    if entity['charStart'] > sent_end:
                                        break
                                    if entity['charStart'] < sent_start:
                                        continue
                                    write_entity(xf, entity)

                    # Write the relations.
                    for relation in doc['relation']:
                        with xf.element('relation', id=relation['duid']):
                            with xf.element('infon', key='relation type'):
                                xf.write(relation['relationType'])
                            with xf.element('infon', key='source'):
                                xf.write(relation['source'])
                            # Attribute infons precede the nodes: the BioC
                            # DTD models relation as (infon*, node*).
                            # NOTE(review): assumes the output is meant to
                            # be BioC XML - confirm.
                            for attr in relation['attribute']:
                                with xf.element('infon', key=attr['key']):
                                    xf.write(attr['value'])

                            for arg in relation['argument']:
                                xf.write(etree.Element(
                                    'node',
                                    refid=arg['entity_duid'],
                                    role=arg['role']))
	from __future__ import unicode_literals
	import sys
	import codecs
	import json
	from lxml import etree

	# See http://lxml.de/api.html#incremental-xml-generation
	# for incremental XML generation used below.


	def write_entity(xf, entity, doc_text=None):
	    """Write one entity as an ``<annotation>`` element.

	    The annotation holds, in this order: a ``type`` infon, a ``source``
	    infon, one infon per entry of ``entity['entityId']``, one infon per
	    entry of ``entity['attribute']``, a ``<location>`` element and the
	    covered ``<text>``.

	    Args:
	        xf: Incremental XML writer (as yielded by ``etree.xmlfile``);
	            only its ``element()`` and ``write()`` methods are used.
	        entity: Mapping with the keys ``duid``, ``entityType``,
	            ``source``, ``charStart``, ``charEnd``, ``entityId`` and
	            ``attribute``.
	        doc_text: Text that ``charStart``/``charEnd`` index into. When
	            omitted, the module-level ``doc['text']`` is used, which is
	            what two-argument callers have always relied on.

	    Raises:
	        KeyError: If ``entity`` lacks one of the keys listed above.
	        NameError: If ``doc_text`` is omitted and no module-level
	            ``doc`` exists.
	    """
	    if doc_text is None:
	        # Backward-compatible fallback: the conversion loop binds the
	        # current document to the module-level name `doc`.
	        doc_text = doc['text']

	    begin = entity['charStart']
	    # charEnd is inclusive, so the exclusive slice bound is one past it.
	    stop = entity['charEnd'] + 1

	    with xf.element('annotation', id=entity['duid']):
	        with xf.element('infon', key='type'):
	            xf.write(entity['entityType'])
	        with xf.element('infon', key='source'):
	            xf.write(entity['source'])

	        # Every infon is emitted before <location> and <text>: the BioC
	        # DTD models annotation as (infon*, location*, text), so infons
	        # that trail the text would not validate.
	        # NOTE(review): assumes the output is meant to be BioC XML,
	        # which the element names suggest - confirm.
	        for entity_id in entity['entityId']:
	            with xf.element('infon', key=entity_id['source']):
	                xf.write(entity_id['idString'])
	        for attr in entity['attribute']:
	            with xf.element('infon', key=attr['key']):
	                xf.write(attr['value'])

	        xf.write(etree.Element('location',
	                               offset=str(begin),
	                               length=str(stop - begin)))
	        with xf.element('text'):
	            xf.write(doc_text[begin:stop])


	# Command line: converter.py JSON_FILE XML_FILE [DOC_ID]
	#   JSON_FILE  input, one JSON document per line (UTF-8)
	#   XML_FILE   output XML file
	#   DOC_ID     optional id of the single document to convert
	if len(sys.argv) < 3:
	    sys.exit('usage: converter.py JSON_FILE XML_FILE [DOC_ID]')

	json_file_path, xml_file_path = sys.argv[1], sys.argv[2]
	# Only the document with this id is converted. The default is the id
	# that used to be hard-coded, so two-argument invocations behave as
	# before.
	doc_id_filter = sys.argv[3] if len(sys.argv) > 3 else '25789565'

	with codecs.open(json_file_path, 'r', 'utf8') as jf, \
	        etree.xmlfile(xml_file_path, encoding='utf8') as xf:
	    with xf.element('collection'):
	        # Collection header: three empty elements.
	        for header_tag in ('source', 'date', 'key'):
	            xf.write(etree.Element(header_tag))

	        for line in jf:
	            line = line.strip()
	            if not line:
	                # A blank line is not a JSON document; skip it instead
	                # of letting json.loads() fail on an empty string.
	                continue

	            # `doc` has to stay a module-level name: write_entity()
	            # looks the document text up through it.
	            doc = json.loads(line)
	            if doc['docId'] != doc_id_filter:
	                continue

	            # Sentences ordered by their character offset.
	            sentences = sorted(
	                (e for e in doc['entity']
	                 if e['entityType'] == 'SENTENCE'),
	                key=lambda s: s['charStart'])

	            with xf.element('document'):
	                with xf.element('id'):
	                    xf.write(doc['docId'])
	                with xf.element('passage'):
	                    with xf.element('offset'):
	                        # The abstract is the only passage, so its
	                        # offset is always 0.
	                        xf.write('0')

	                    if not sentences:
	                        # No sentence segmentation: write the whole
	                        # text followed by every entity.
	                        with xf.element('text'):
	                            xf.write(doc['text'])

	                        for entity in doc['entity']:
	                            write_entity(xf, entity)
	                    else:
	                        # Attach each non-sentence entity to the
	                        # sentence whose span contains the entity's
	                        # start offset.
	                        entities = sorted(
	                            (e for e in doc['entity']
	                             if e['entityType'] != 'SENTENCE'),
	                            key=lambda e: e['charStart'])

	                        for sent in sentences:
	                            sent_start = sent['charStart']
	                            sent_end = sent['charEnd']  # inclusive

	                            with xf.element('sentence'):
	                                with xf.element('offset'):
	                                    xf.write(str(sent_start))
	                                with xf.element('text'):
	                                    xf.write(
	                                        doc['text'][sent_start:sent_end + 1])

	                                for entity in entities:
	                                    # Entities are sorted, so the first
	                                    # one starting past this sentence
	                                    # ends the scan.
	                                    if entity['charStart'] > sent_end:
	                                        break
	                                    if entity['charStart'] < sent_start:
	                                        continue
	                                    write_entity(xf, entity)

	                    # Write the relations.
	                    for relation in doc['relation']:
	                        with xf.element('relation', id=relation['duid']):
	                            with xf.element('infon', key='relation type'):
	                                xf.write(relation['relationType'])
	                            with xf.element('infon', key='source'):
	                                xf.write(relation['source'])
	                            # Attribute infons precede the nodes: the
	                            # BioC DTD models relation as
	                            # (infon*, node*).
	                            # NOTE(review): assumes the output is meant
	                            # to be BioC XML - confirm.
	                            for attr in relation['attribute']:
	                                with xf.element('infon', key=attr['key']):
	                                    xf.write(attr['value'])

	                            for arg in relation['argument']:
	                                xf.write(etree.Element(
	                                    'node',
	                                    refid=arg['entity_duid'],
	                                    role=arg['role']))