zyocum/data.json Secret

## data.json
{"data": "\nThis is some data .\n\n\nSometimes tags in the data are nested .\n\n\nIn addition to nested tags , sometimes there is also junk we need to ignore .\n\nJunk is marked by uppercase characters between asterisks and can also optionally be followed by a dash and then one or more digits . \n\nNote that *this* is just emphasized . It's not junk !\n\n", "entities": [{"id": 0, "mentions": [{"start": 9, "end": 18, "id": 0, "text": "some data"}, {"start": 41, "end": 49, "id": 0, "text": "the data"}]}, {"id": 1, "mentions": [{"start": 33, "end": 60, "id": 1, "text": "tags in the data are nested"}, {"start": 80, "end": 91, "id": 1, "text": "nested tags"}]}, {"id": 2, "mentions": [{"start": 118, "end": 122, "id": 2, "text": "junk"}, {"start": 144, "end": 148, "id": 2, "text": "Junk"}, {"start": 326, "end": 330, "id": 2, "text": "junk"}]}, {"id": 3, "mentions": [{"start": 1, "end": 5, "id": 3, "text": "This"}]}, {"id": 4, "mentions": [{"start": 289, "end": 295, "id": 4, "text": "*this*"}]}]}

## data.xml
<DOC>
<TEXT PARTNO="000">
<TAG ID="3">This</TAG> is <TAG ID="0">some *JUNK* data</TAG> .
</TEXT>
<TEXT PARTNO="001">
*FOO* Sometimes <TAG ID="1">tags in <TAG ID="0">the data</TAG> are nested</TAG> .
</TEXT>
<TEXT PARTNO="002">
In addition to <TAG ID="1">nested tags</TAG> , sometimes there is also <TAG ID="2">junk</TAG> we need to ignore .
</TEXT>
<TEXT PARTNO="003">*BAR*-1
<TAG ID="2">Junk</TAG> is marked by uppercase characters between asterisks and can also optionally be followed by a dash and then one or more digits . *JUNK*-123
</TEXT>
<TEXT PARTNO="004">
Note that <TAG ID="4">*this*</TAG> is just emphasized . It's not <TAG ID="2">junk</TAG> !
</TEXT>
</DOC>

## xml2json.py
#!/usr/bin/env python3

"""Parse XML and bookkeep textual character offsets of selected tag elements"""

import json
import re
import sys

from itertools import groupby
from operator import itemgetter

from lxml import etree

def load_etree(filename):
    """Load an XML element tree from an XML .coref file.

    The file is read as raw bytes so the parser can apply the encoding
    named in the XML declaration itself. Handing lxml an already-decoded
    ``str`` that still carries an encoding declaration raises
    ``ValueError``, and text-mode decoding would use the locale encoding
    rather than the one the document declares.

    Args:
        filename: path to the XML file to parse.

    Returns:
        The root element of the parsed document.
    """
    with open(filename, mode='rb') as f:
        return etree.fromstring(f.read())

def extent(obj):
    """Return the ``(start, end)`` offsets stored on a dict-like object.

    A key that is absent is reported as ``-1``.

    >>> extent({'start': 0, 'end': 5})
    (0, 5)
    >>> extent({'start': 0, 'end': 10})
    (0, 10)
    >>> extent({'start': 5, 'end': 10})
    (5, 10)
    >>> extent({})
    (-1, -1)
    """
    begin = obj.get('start', -1)
    stop = obj.get('end', -1)
    return begin, stop

# Junk marker: one or more characters from [A-Z?] wrapped in asterisks, any
# whitespace that follows, and optionally "-<digits>" plus further whitespace.
# Lowercase emphasis such as *this* does not match.
JUNK = re.compile(
    r'''
    \*[A-Z?]+\*\s* # junk includes uppercase letters between asterisks
    (?:-\d+\s*)?   # sometimes junk has a trailing dash and a numerical ID
    ''',
    flags=re.VERBOSE,
)

def remove_junk(data):
    """Yield, in order, the pieces of *data* that lie between junk markers.

    The text before the first marker and after the last one is always
    yielded, so some pieces may be empty strings.
    """
    # JUNK has no capturing groups and never matches the empty string, so
    # splitting on it returns exactly the text between successive matches.
    yield from JUNK.split(data)

def clean(data):
    """Return *data* with junk markers stripped out.

    ``None`` (e.g. an element without text or tail) becomes an empty string.
    """
    if data is None:
        return ''
    return ''.join(remove_junk(data))

def add_text(tag, data):
    """Store under ``tag['text']`` the slice of *data* covered by the tag.

    The slice bounds come from ``extent(tag)``. The tag is modified in
    place and also returned, so the call can be used inside an expression.
    """
    span = slice(*extent(tag))
    tag['text'] = data[span]
    return tag

def entities(tags, data):
    """Chain together tags that share an identifier, in ascending id order.

    Yields one dict per distinct ``'id'`` with the keys ``'id'`` and
    ``'mentions'``. Each mention is the tag dict itself, after ``add_text``
    has attached the covered text to it. The sort is stable, so mentions
    keep their relative order from *tags*.
    """
    by_id = itemgetter('id')
    ordered = sorted(tags, key=by_id)
    for ident, members in groupby(ordered, key=by_id):
        mentions = [add_text(member, data) for member in members]
        yield {'id': ident, 'mentions': mentions}

def document(filename, tag_types):
    """Convert the XML in *filename* into a JSON-ready dict.

    Walks the element tree and, for every element whose tag is listed in
    *tag_types*, appends its junk-free text (on 'start') and tail (on
    'end') to the output while tracking the running character offset.
    Elements other than 'TEXT' and 'DOC' are additionally recorded as
    mentions with their start/end offsets. An element without an ``ID``
    attribute is given the next value of a counter that starts at 0.

    Returns a dict with the cleaned text under ``'data'`` and the grouped
    mentions under ``'entities'``.
    """
    containers = {'TEXT', 'DOC'}  # contribute text but are never mentions
    pieces, open_tags, closed_tags = [], [], []
    offset, next_auto_id = 0, 0
    root = load_etree(filename)
    for event, element in etree.iterwalk(root, events=('start', 'end')):
        if element.tag not in tag_types:
            continue
        is_mention = element.tag not in containers
        if event == 'start':
            if is_mention:
                raw_id = element.attrib.get('ID')
                if raw_id is None:
                    raw_id = next_auto_id
                    next_auto_id += 1
                # The mention opens at the current offset, i.e. before its
                # own text is appended below.
                open_tags.append({
                    'start': offset,
                    'end': None,
                    'id': int(raw_id)
                })
            fragment = clean(element.text)
        elif event == 'end':
            if is_mention:
                # Elements close innermost-first, so the top of the stack
                # is the mention that belongs to this element.
                finished = open_tags.pop()
                finished['end'] = offset
                closed_tags.append(finished)
            # The tail follows the closing tag, so it lies outside the
            # mention that was just closed.
            fragment = clean(element.tail)
        else:
            continue
        pieces.append(fragment)
        offset += len(fragment)
    data = ''.join(pieces)
    return {
        'data': data,
        'entities': list(entities(closed_tags, data))
    }

def main(filename, tag_types):
    """Convert the XML file to JSON and print it to standard output.

    Non-ASCII characters are written as-is rather than escaped.
    """
    converted = document(filename, tag_types)
    serialized = json.dumps(converted, ensure_ascii=False)
    print(serialized)

if __name__ == '__main__':
    # Command-line entry point: one positional input path plus an optional
    # list of tag names; the module docstring doubles as the description.
    import argparse
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('input', help='path to an input XML file')
    parser.add_argument(
        '-t',
        '--tag-types',
        default=['TEXT', 'TAG'],
        nargs='+',
        help='set of XML tag types to parse',
    )
    args = parser.parse_args()
    main(args.input, args.tag_types)
	<DOC>
	<TEXT PARTNO="000">
	<TAG ID="3">This</TAG> is <TAG ID="0">some *JUNK* data</TAG> .
	</TEXT>
	<TEXT PARTNO="001">
	*FOO* Sometimes <TAG ID="1">tags in <TAG ID="0">the data</TAG> are nested</TAG> .
	</TEXT>
	<TEXT PARTNO="002">
	In addition to <TAG ID="1">nested tags</TAG> , sometimes there is also <TAG ID="2">junk</TAG> we need to ignore .
	</TEXT>
	<TEXT PARTNO="003">*BAR*-1
	<TAG ID="2">Junk</TAG> is marked by uppercase characters between asterisks and can also optionally be followed by a dash and then one or more digits . *JUNK*-123
	</TEXT>
	<TEXT PARTNO="004">
	Note that <TAG ID="4">*this*</TAG> is just emphasized . It's not <TAG ID="2">junk</TAG> !
	</TEXT>
	</DOC>
	#!/usr/bin/env python3

	"""Parse XML and bookkeep textual character offsets of selected tag elements"""

	import json
	import re
	import sys

	from itertools import groupby
	from operator import itemgetter

	from lxml import etree

	def load_etree(filename):
	"""Load an XML element tree from XML .coref file"""
	with open(filename, mode='r') as f:
	return etree.fromstring(f.read())

	def extent(obj):
	"""Get the start and end offset attributes of a dict-like object

	a = {'start': 0, 'end': 5}
	b = {'start': 0, 'end': 10}
	c = {'start': 5, 'end': 10}

	extent(a) -> (0, 5)
	extent(b) -> (0, 10)
	extent(c) -> (5, 10)
	extent({}) -> (-1, -1)

	"""
	return obj.get('start', -1), obj.get('end', -1)

	JUNK = re.compile(
	r'''
	\[A-Z?]+\\s* # junk includes uppercase letters between asterisks
	(?:-\d+\s*)? # sometimes junk has a trailing dash and a numerical ID
	''',
	re.X
	)

	def remove_junk(data):
	"""Generate text chunks with junk removed."""
	cursor = 0
	for match in JUNK.finditer(data):
	start, end = match.span()
	yield data[cursor:start]
	cursor = end
	yield data[cursor:]

	def clean(data):
	"""Remove junk from text."""
	return ''.join([] if data is None else remove_junk(data))

	def add_text(tag, data):
	"""Add text attribute to a tag by slicing the data at the appropriate
	character offsets."""
	start, end = extent(tag)
	tag['text'] = data[start:end]
	return tag

	def entities(tags, data):
	"""Chain entities together that have the same identifier."""
	grouped = groupby(
	sorted(tags, key=itemgetter('id')),
	key=itemgetter('id')
	)
	for _, group in grouped:
	group = list(group)
	first, *rest = group
	yield {
	'id': first['id'],
	'mentions': [add_text(tag, data) for tag in group]
	}

	def document(filename, tag_types):
	"""Convert XML from a given file to JSON by cleaning text and
	keeping track of character offsets of selected tag elements."""
	chunks, stack, tags = [], [], []
	cursor, id_ = 0, 0
	for event, element in etree.iterwalk(
	load_etree(filename),
	events=('start', 'end')
	):
	if element.tag in tag_types:
	if event == 'start':
	text = clean(element.text)
	if element.tag not in {'TEXT', 'DOC'}:
	tag_id = element.attrib.get('ID')
	if tag_id is None:
	tag_id = id_
	id_ += 1
	tag = {
	'start': cursor,
	'end': None,
	'id': int(tag_id)
	}
	stack.append(tag)
	chunks.append(text)
	cursor += len(text)
	if event == 'end':
	if element.tag not in {'TEXT', 'DOC'}:
	tag = stack.pop()
	tag['end'] = cursor
	tags.append(tag)
	if element.tag is not None:
	text = clean(element.tail)
	chunks.append(text)
	cursor += len(text)
	data = ''.join(chunks)
	return {
	'data': data,
	'entities': list(entities(tags, data))
	}

	def main(filename, tag_types):
	print(json.dumps(document(filename, tag_types), ensure_ascii=False))

	if __name__ == '__main__':
	import argparse
	parser = argparse.ArgumentParser(
	formatter_class=argparse.ArgumentDefaultsHelpFormatter,
	description=__doc__
	)
	parser.add_argument(
	'input',
	help='path to an input XML file'
	)
	parser.add_argument(
	'-t', '--tag-types',
	nargs='+',
	default=['TEXT', 'TAG'],
	help='set of XML tag types to parse'
	)
	args = parser.parse_args()
	main(args.input, args.tag_types)