#!/usr/bin/env python
"""
Convert a document in the corpus to the brat ANN format.

Call like 'doc2ann.py <path>/<document id>'. Outputs two files in the current
directory, <file>.txt and <file>.ann, where <file> is derived from the
command-line argument.
"""
import argparse
import json
import operator
import sys
from itertools import count


def accumulate(iterable, func=operator.add):
    'Return running totals'
    # Backport of itertools.accumulate for Python 2.
    # accumulate([1,2,3,4,5]) --> 1 3 6 10 15
    # accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120
    it = iter(iterable)
    total = next(it)
    yield total
    for element in it:
        total = func(total, element)
        yield total
parser = argparse.ArgumentParser()
parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
parser.add_argument('base_file_path')
args = parser.parse_args()

# Input file paths
base_file_path = args.base_file_path
sentence_file_path = '.'.join((base_file_path, 'sentences'))
pos_file_path = '.'.join((base_file_path, 'pos'))
dep_file_path = '.'.join((base_file_path, 'dep'))

# Read in sentences, POS, and dependencies
sentences = [[w for w in l.split() if w.strip()]
             for l in open(sentence_file_path) if l.strip()]
pos_strings = [l for l in open(pos_file_path)]
dep_strings = [l for l in open(dep_file_path)]

# Calculate output file names
base_out_name = '_'.join(base_file_path.split('/')[-4:])
txt_out_name = '.'.join((base_out_name, 'txt'))
ann_out_name = '.'.join((base_out_name, 'ann'))

# Add a dummy root token to each sentence
sentences = [['ROOT'] + s for s in sentences]

# Calculate sentence offsets (with extra 1 for newline char)
sentence_lengths = [len(' '.join(s)) + 1 for s in sentences]
sentence_offsets = [0] + list(accumulate(sentence_lengths[:-1]))

# Within-sentence token offsets (with extra 1 for space)
token_lengths = [[len(w) + 1 for w in s] for s in sentences]
token_offsets = [[0] + list(accumulate(tls[:-1])) for tls in token_lengths]
# Dump the sentences to *.txt
with open(txt_out_name, 'w') as txt_out:
    for s in sentences:
        print >>txt_out, ' '.join(s)

# Output annotations to *.ann

# POS replacement dict: rename tags whose characters brat won't accept in a type name
pos_replacements = {
    "''": "QUOTE_CLOSE",
    "``": "QUOTE_OPEN",
    'PRP': 'PRON',
    'PRP$': 'PRON_POS',
    ',': 'COMMA',
    ':': 'PUNCT',
    '.': 'PERIOD',
    '$': 'MONEY',
    'WP$': 'WP_POS',
}

# ID generators
def token_id_gen():
    for n in count(1):
        yield "T{}".format(n)
token_ids = token_id_gen()

def relation_id_gen():
    for n in count(1):
        yield "R{}".format(n)
relation_ids = relation_id_gen()
ann_out = open(ann_out_name, 'w')
for i, ws, pstr, dstr in zip(count(0), sentences, pos_strings, dep_strings):
    try:
        # get offsets
        soff = sentence_offsets[i]
        woffs = token_offsets[i]
        # parse POS and dependency annotations
        ps = json.loads(pstr)
        ds = json.loads(dstr)
        # correct POS for dummy token
        ps = ['ROOT'] + ps
        # correct dependency token indices for dummy token
        ds = [(r, h + 1, d + 1) for r, h, d in ds]
        # create ANN tokens
        ann_tokens = [
            dict(
                tid=t,
                pos=pos_replacements.get(p, p),
                word=w,
                start=soff + b,
                end=soff + b + len(w)  # ANN offsets are one-past-end
            )
            for t, p, b, w in zip(token_ids, ps, woffs, ws)
        ]
        ann_relations = [
            dict(
                rid=k,
                rel=r,
                hid=ann_tokens[h]['tid'],
                did=ann_tokens[d]['tid']
            )
            for k, r, h, d in [(k,) + tup for k, tup in zip(relation_ids, ds)]
        ]
        # output annotations
        for tok in ann_tokens:
            print >>ann_out, "{tid}\t{pos} {start} {end}\t{word}".format(**tok)
        for rel in ann_relations:
            print >>ann_out, "{rid}\t{rel} Arg1:{hid} Arg2:{did}".format(**rel)
    except Exception as e:
        # print >>sys.stderr, '{file}:{sent}:Exception: {ex}'.format(file=base_file_path,
        #                                                            sent=i,
        #                                                            ex=e)
        # sys.exit(1)
        raise
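
For concreteness, a hypothetical invocation (the corpus path below is made up):

    python doc2ann.py corpus/wsj/00/wsj_0001

This would read corpus/wsj/00/wsj_0001.sentences, corpus/wsj/00/wsj_0001.pos, and corpus/wsj/00/wsj_0001.dep, and write corpus_wsj_00_wsj_0001.txt and corpus_wsj_00_wsj_0001.ann to the current directory (the output name joins the last four components of the input path with underscores).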
Hi!
Yeah, I know the script isn't exactly useful as is -- it's meant to show an example of outputting the format brat expects. The most interesting parts are probably calculating the character offsets for the part-of-speech annotations and creating a unique ID for each annotation.
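To make the offset bookkeeping concrete, here is a tiny standalone sketch of the same idea (the toy sentence and variable names are made up for illustration and are not part of the script):

    tokens = ['ROOT', 'Dogs', 'bark', '.']   # dummy ROOT already prepended
    lengths = [len(w) + 1 for w in tokens]   # +1 for the space after each token
    offsets = [0]
    for n in lengths[:-1]:                   # running total, as accumulate() computes
        offsets.append(offsets[-1] + n)
    # offsets == [0, 5, 10, 15]; a token starting at offset b gets the span (b, b + len(w)),
    # so 'bark' is annotated as 10 14 (the end offset is one past the last character)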
There are three input files: a file with tokenized text, one sentence per line; a file with POS tags (each line is a JSON list of POS tags corresponding to a sentence in the text file); and a file with dependency relations (each line is a JSON list of 3-element [label, head_index, daughter_index] lists, one line per sentence). If the CoreNLP output has character offsets you really don't need anything here; you can just read the XML and output the annotations (see the format strings in the print statements at the end of the script). I ended up with this format because JSON is a little bit easier to deal with than XML.
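For illustration (this example is made up, not from the corpus), one sentence in each of the three input files and the output the script would produce for it might look like the following. The offsets are worked out by hand following the script's logic: the dummy ROOT token is prepended, tokens are separated by single spaces, and the root dependency is assumed to use head index -1 so that the script's +1 correction maps it onto the dummy token. Fields in the .ann lines are tab-separated.

    <doc>.sentences:  Dogs bark .
    <doc>.pos:        ["NNS", "VBP", "."]
    <doc>.dep:        [["root", -1, 1], ["nsubj", 1, 0], ["punct", 1, 2]]

    output .txt:      ROOT Dogs bark .
    output .ann:      T1  ROOT 0 4      ROOT
                      T2  NNS 5 9       Dogs
                      T3  VBP 10 14     bark
                      T4  PERIOD 15 16  .
                      R1  root Arg1:T1 Arg2:T3
                      R2  nsubj Arg1:T3 Arg2:T2
                      R3  punct Arg1:T3 Arg2:T4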
Thanks for putting this up. We are trying to take the XML output from CoreNLP and feed it into brat (locally on our system - we can't do it online). It looks like your script expects 4 input files and I can't figure out what the .pos, .sentences, and .dep files are or what generates them. When we feed text to CoreNLP all we get back is the .xml file.
Any help greatly appreciated.
-JK Scheinberg
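
Following up on the suggestion above about reading the CoreNLP XML directly when it already carries character offsets: a rough sketch of that route is below. The element and attribute names (token, word, POS, CharacterOffsetBegin/End, dependencies, governor/dependent idx) reflect the usual CoreNLP XML output but should be checked against your own .xml file; the output-file argument, the script name, and the skipping of the artificial ROOT governor are assumptions, not something from the gist. The .txt file brat needs would then just be the original text you fed to CoreNLP, since the offsets refer to it.

    import sys
    import xml.etree.ElementTree as ET

    # Sketch: convert CoreNLP XML (with character offsets) to brat standoff.
    # Hypothetical usage: corenlp2ann.py <corenlp output>.xml <output>.ann
    tree = ET.parse(sys.argv[1])
    tid = 0   # running token (T) id
    rid = 0   # running relation (R) id
    with open(sys.argv[2], 'w') as ann:
        for sentence in tree.find('.//sentences'):
            token_ids = {}  # CoreNLP per-sentence token id -> brat T id
            for token in sentence.iter('token'):
                tid += 1
                token_ids[token.get('id')] = 'T{}'.format(tid)
                word = token.find('word').text
                pos = token.find('POS').text  # may still need renaming, as in pos_replacements
                start = token.find('CharacterOffsetBegin').text
                end = token.find('CharacterOffsetEnd').text
                ann.write('T{}\t{} {} {}\t{}\n'.format(tid, pos, start, end, word))
            deps = sentence.find("dependencies[@type='basic-dependencies']")
            if deps is None:
                continue
            for dep in deps.iter('dep'):
                gov = dep.find('governor').get('idx')
                dnt = dep.find('dependent').get('idx')
                if gov == '0':  # artificial ROOT governor has no token to attach to
                    continue
                rid += 1
                ann.write('R{}\t{} Arg1:{} Arg2:{}\n'.format(
                    rid, dep.get('type'), token_ids[gov], token_ids[dnt]))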