Skip to content

Instantly share code, notes, and snippets.

@zyocum

zyocum/data.json Secret

Last active October 20, 2018 05:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zyocum/f49d28e56053eaa025592858bddfc420 to your computer and use it in GitHub Desktop.
Save zyocum/f49d28e56053eaa025592858bddfc420 to your computer and use it in GitHub Desktop.
{"data": "\nThis is some data .\n\n\nSometimes tags in the data are nested .\n\n\nIn addition to nested tags , sometimes there is also junk we need to ignore .\n\nJunk is marked by uppercase characters between asterisks and can also optionally be followed by a dash and then one or more digits . \n\nNote that *this* is just emphasized . It's not junk !\n\n", "entities": [{"id": 0, "mentions": [{"start": 9, "end": 18, "id": 0, "text": "some data"}, {"start": 41, "end": 49, "id": 0, "text": "the data"}]}, {"id": 1, "mentions": [{"start": 33, "end": 60, "id": 1, "text": "tags in the data are nested"}, {"start": 80, "end": 91, "id": 1, "text": "nested tags"}]}, {"id": 2, "mentions": [{"start": 118, "end": 122, "id": 2, "text": "junk"}, {"start": 144, "end": 148, "id": 2, "text": "Junk"}, {"start": 326, "end": 330, "id": 2, "text": "junk"}]}, {"id": 3, "mentions": [{"start": 1, "end": 5, "id": 3, "text": "This"}]}, {"id": 4, "mentions": [{"start": 289, "end": 295, "id": 4, "text": "*this*"}]}]}
<DOC>
<TEXT PARTNO="000">
<TAG ID="3">This</TAG> is <TAG ID="0">some *JUNK* data</TAG> .
</TEXT>
<TEXT PARTNO="001">
*FOO* Sometimes <TAG ID="1">tags in <TAG ID="0">the data</TAG> are nested</TAG> .
</TEXT>
<TEXT PARTNO="002">
In addition to <TAG ID="1">nested tags</TAG> , sometimes there is also <TAG ID="2">junk</TAG> we need to ignore .
</TEXT>
<TEXT PARTNO="003">*BAR*-1
<TAG ID="2">Junk</TAG> is marked by uppercase characters between asterisks and can also optionally be followed by a dash and then one or more digits . *JUNK*-123
</TEXT>
<TEXT PARTNO="004">
Note that <TAG ID="4">*this*</TAG> is just emphasized . It's not <TAG ID="2">junk</TAG> !
</TEXT>
</DOC>
#!/usr/bin/env python3
"""Parse XML and bookkeep textual character offsets of selected tag elements"""
import json
import re
import sys
from itertools import groupby
from operator import itemgetter
from lxml import etree
def load_etree(filename):
    """Load an XML element tree from an XML .coref file.

    The file is read as raw bytes so the XML parser can detect the encoding
    itself. Handing lxml an already-decoded ``str`` that carries an
    ``<?xml ... encoding=...?>`` declaration makes ``etree.fromstring`` raise
    ``ValueError``, and text-mode decoding depends on the locale's default
    encoding rather than on what the document declares.

    Args:
        filename: path to the XML file to parse.

    Returns:
        The root element of the parsed document.
    """
    with open(filename, mode='rb') as f:
        return etree.fromstring(f.read())
def extent(obj):
    """Return the ``(start, end)`` offsets stored on a dict-like object.

    Either offset falls back to ``-1`` when its key is absent:

    extent({'start': 0, 'end': 5}) -> (0, 5)
    extent({'start': 5, 'end': 10}) -> (5, 10)
    extent({'start': 3}) -> (3, -1)
    extent({}) -> (-1, -1)
    """
    begin = obj.get('start', -1)
    finish = obj.get('end', -1)
    return begin, finish
# Pattern for the "junk" annotations that must be cut out of the text, e.g.
# ``*FOO*`` or ``*JUNK*-123``. Whitespace trailing the junk is part of the
# match. Lowercase emphasis such as ``*this*`` is deliberately not matched.
JUNK = re.compile(
    r'''
\*[A-Z?]+\*\s* # junk includes uppercase letters between asterisks
(?:-\d+\s*)? # sometimes junk has a trailing dash and a numerical ID
''',
    flags=re.VERBOSE,
)
def remove_junk(data):
    """Yield, in order, the pieces of ``data`` that are not junk.

    Splitting on ``JUNK`` gives the text before the first match, the text
    between each pair of consecutive matches, and the text after the last
    match. Joining the yielded pieces therefore gives ``data`` with every
    junk span removed.
    """
    # JUNK has no capturing groups, so split() returns only the text between
    # matches and never the matched junk itself.
    yield from JUNK.split(data)
def clean(data):
    """Return ``data`` with junk stripped out; ``None`` becomes ``''``."""
    if data is None:
        return ''
    return ''.join(remove_junk(data))
def add_text(tag, data):
    """Store under ``tag['text']`` the slice of ``data`` covered by the tag.

    The slice bounds are the tag's start and end offsets as reported by
    ``extent``. The tag is modified in place and also returned.
    """
    covered = slice(*extent(tag))
    tag['text'] = data[covered]
    return tag
def entities(tags, data):
    """Chain entities together that have the same identifier.

    Tags are sorted by their ``'id'`` and then grouped on it. The sort is
    stable, so tags that share an id keep their incoming order. Every tag
    gets a ``'text'`` attribute from ``add_text`` as its entity is produced.

    Args:
        tags: iterable of tag dicts carrying an ``'id'`` key plus the
            offsets ``add_text`` slices with.
        data: the text the tag offsets index into.

    Yields:
        One ``{'id': ..., 'mentions': [...]}`` dict per distinct id, in
        ascending id order.
    """
    by_id = itemgetter('id')
    # groupby only merges adjacent items, hence the sort on the same key.
    for entity_id, mentions in groupby(sorted(tags, key=by_id), key=by_id):
        yield {
            'id': entity_id,
            'mentions': [add_text(tag, data) for tag in mentions]
        }
def document(filename, tag_types):
    """Convert XML from a given file to JSON by cleaning text and
    keeping track of character offsets of selected tag elements.

    The tree is walked with 'start' and 'end' events. Only elements whose
    tag is in ``tag_types`` contribute: their junk-free text is appended on
    'start' and their junk-free tail on 'end'. A running offset into the
    accumulated text gives each tag its start position when it opens and
    its end position when it closes.

    Args:
        filename: path to the XML file to convert.
        tag_types: collection of element tag names to process.

    Returns:
        A dict with the cleaned text under ``'data'`` and the grouped tag
        mentions under ``'entities'``.
    """
    # These tags contribute text but are never recorded as mentions.
    containers = {'TEXT', 'DOC'}
    pieces, open_tags, closed_tags = [], [], []
    offset, next_auto_id = 0, 0
    root = load_etree(filename)
    for event, element in etree.iterwalk(root, events=('start', 'end')):
        if element.tag not in tag_types:
            continue
        is_mention = element.tag not in containers
        if event == 'start':
            fragment = clean(element.text)
            if is_mention:
                tag_id = element.attrib.get('ID')
                if tag_id is None:
                    # NOTE(review): auto-assigned ids start at 0 and could
                    # coincide with explicit ID attributes - confirm intent.
                    tag_id = next_auto_id
                    next_auto_id += 1
                # The start offset is taken before the element's own text
                # is counted; the end is filled in when the element closes.
                open_tags.append({
                    'start': offset,
                    'end': None,
                    'id': int(tag_id)
                })
        else:
            # Only 'start' and 'end' were requested, so this is 'end'.
            if is_mention:
                # Elements close innermost-first, so the most recently
                # opened tag is the one ending here.
                finished = open_tags.pop()
                finished['end'] = offset
                closed_tags.append(finished)
            # The tail follows the closing tag, so it is counted only after
            # the end offset has been recorded.
            fragment = clean(element.tail)
        pieces.append(fragment)
        offset += len(fragment)
    data = ''.join(pieces)
    return {
        'data': data,
        'entities': list(entities(closed_tags, data))
    }
def main(filename, tag_types):
    """Convert the given XML file and print the result as JSON.

    Non-ASCII characters are written as-is rather than as escapes.
    """
    converted = document(filename, tag_types)
    serialized = json.dumps(converted, ensure_ascii=False)
    print(serialized)
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and hand them to main().
    from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

    cli = ArgumentParser(
        description=__doc__,
        formatter_class=ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument('input', help='path to an input XML file')
    cli.add_argument(
        '-t', '--tag-types',
        default=['TEXT', 'TAG'],
        nargs='+',
        help='set of XML tag types to parse',
    )
    options = cli.parse_args()
    main(options.input, options.tag_types)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment