Created
October 8, 2013 08:15
-
-
Save cordarei/6881311 to your computer and use it in GitHub Desktop.
Converting annotations to brat format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
Convert a document in the corpus to the brat ANN format.

Call like 'doc2ann.py <path>/<document id>'. Outputs two files in the current
directory, <file>.txt and <file>.ann, where <file> is named from the
command-line argument.
"""
import argparse
import json
import operator
import sys
from itertools import count
def accumulate(iterable, func=operator.add):
    """Yield running totals of *iterable* (backport of itertools.accumulate).

    accumulate([1,2,3,4,5])               --> 1 3 6 10 15
    accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120

    An empty iterable yields nothing.  The explicit StopIteration handling
    avoids leaking StopIteration out of a generator, which is a RuntimeError
    under PEP 479 (Python 3.7+).
    """
    it = iter(iterable)
    try:
        total = next(it)
    except StopIteration:
        return  # empty input: produce an empty sequence of totals
    yield total
    for element in it:
        total = func(total, element)
        yield total
# Command line: one positional base path plus an (unused here) verbosity flag.
parser = argparse.ArgumentParser()
parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
parser.add_argument('base_file_path')
args = parser.parse_args()

# Derive the three input file paths from the base path.
base_file_path = args.base_file_path
sentence_file_path = base_file_path + '.sentences'
pos_file_path = base_file_path + '.pos'
dep_file_path = base_file_path + '.dep'
# Read the inputs.  Sentences are tokenized on whitespace; the POS and
# dependency annotations stay as raw JSON lines (parsed per sentence later).
# Context managers close the input files promptly instead of leaking the
# handles until interpreter shutdown.
with open(sentence_file_path) as sent_file:
    sentences = [[w for w in l.split() if w.strip()]
                 for l in sent_file if l.strip()]
with open(pos_file_path) as pos_file:
    pos_strings = pos_file.readlines()
with open(dep_file_path) as dep_file:
    dep_strings = dep_file.readlines()
# Output names: flatten the last four path components into one underscored
# file name, so documents from different corpus folders stay distinct in a
# flat output directory.
base_out_name = '_'.join(base_file_path.split('/')[-4:])
txt_out_name = base_out_name + '.txt'
ann_out_name = base_out_name + '.ann'
# Insert a dummy ROOT token so dependency head index 0 points at a real token.
sentences = [['ROOT'] + words for words in sentences]

# Start offset of each sentence in the .txt output; every sentence is
# followed by one newline character, hence the +1 per length.
sentence_lengths = [len(' '.join(words)) + 1 for words in sentences]
sentence_offsets = [0] + list(accumulate(sentence_lengths[:-1]))

# Start offset of each token within its own sentence; tokens are joined by
# single spaces, hence the +1 per length.
token_lengths = [[len(word) + 1 for word in words] for words in sentences]
token_offsets = [[0] + list(accumulate(lens[:-1])) for lens in token_lengths]
# Dump the (ROOT-prefixed) sentences to *.txt, one sentence per line.
# A plain write() replaces the Python-2-only `print >>` statement so the
# script also runs under Python 3; output bytes are identical.
with open(txt_out_name, 'w') as txt_out:
    for words in sentences:
        txt_out.write(' '.join(words) + '\n')
# Output annotations to *.ann
# Some Penn Treebank tags contain characters brat cannot use as type names
# (quotes, '$', punctuation); map those tags to safe ASCII identifiers.
pos_replacements = dict([
    ("''", "QUOTE_CLOSE"),
    ("``", "QUOTE_OPEN"),
    ('PRP', 'PRON'),
    ('PRP$', 'PRON_POS'),
    (',', 'COMMA'),
    (':', 'PUNCT'),
    ('.', 'PERIOD'),
    ('$', 'MONEY'),
    ('WP$', 'WP_POS'),
])
# ID generators.  Every brat annotation needs a unique ID: text-bound
# annotations are T1, T2, ...; relations are R1, R2, ...  One parameterized
# generator replaces the two previously duplicated generator functions.
def _id_gen(prefix):
    """Yield '<prefix>1', '<prefix>2', ... without end."""
    for n in count(1):
        yield "{}{}".format(prefix, n)

def token_id_gen():
    """Generate brat text-bound annotation IDs: T1, T2, ..."""
    return _id_gen("T")

def relation_id_gen():
    """Generate brat relation annotation IDs: R1, R2, ..."""
    return _id_gen("R")

token_ids = token_id_gen()
relation_ids = relation_id_gen()
# Emit the annotations to *.ann in brat standoff format: one T line per
# token (POS type + character span + surface form) and one R line per
# dependency (label + head/daughter token IDs).  The `with` block closes
# (and flushes) the output file, which the bare open() did not guarantee.
with open(ann_out_name, 'w') as ann_out:
    for i, (ws, pstr, dstr) in enumerate(zip(sentences, pos_strings, dep_strings)):
        try:
            # Absolute offset of this sentence, and token offsets within it.
            soff = sentence_offsets[i]
            woffs = token_offsets[i]
            # Each annotation line is one JSON document.
            ps = json.loads(pstr)   # list of POS tags
            ds = json.loads(dstr)   # list of [label, head, daughter] triples
            # Account for the dummy ROOT token prepended earlier: give it a
            # POS and shift every dependency index up by one.
            ps = ['ROOT'] + ps
            ds = [(r, h + 1, d + 1) for r, h, d in ds]
            # Text-bound annotations: id, (sanitized) POS, char span, word.
            ann_tokens = [
                dict(
                    tid=t,
                    pos=pos_replacements.get(p, p),
                    word=w,
                    start=soff + b,
                    end=soff + b + len(w)  # brat offsets are one-past-end
                )
                for t, p, b, w in zip(token_ids, ps, woffs, ws)
            ]
            # Binary relations between the token annotations above.
            ann_relations = [
                dict(rid=k, rel=r,
                     hid=ann_tokens[h]['tid'],
                     did=ann_tokens[d]['tid'])
                for k, (r, h, d) in zip(relation_ids, ds)
            ]
            # write() instead of the Python-2-only `print >>` statement keeps
            # the script runnable under Python 3; output bytes are identical.
            for tok in ann_tokens:
                ann_out.write("{tid}\t{pos} {start} {end}\t{word}\n".format(**tok))
            for rel in ann_relations:
                ann_out.write("{rid}\t{rel} Arg1:{hid} Arg2:{did}\n".format(**rel))
        except Exception:
            # Report which document/sentence failed, then propagate.
            sys.stderr.write('{file}:{sent}: conversion failed\n'.format(
                file=base_file_path, sent=i))
            raise
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi!
Yeah, I know the script isn't exactly useful as is -- it's meant to show an example of outputting the format brat expects. The most interesting parts are probably calculating the character offsets for the part-of-speech annotations and creating a unique ID for each annotation.
There are three input files: a file with tokenized text, one sentence per line; a file with POS tags (each line is a list of POS tags corresponding to a sentence in the text file); and a file with dependency relations (each line is a list of 3-element [label, head_index, daughter_index] lists, one for each sentence). If the CoreNLP output has character offsets you really don't need anything here, you can just read the XML and output the annotations (see lines 141-145 for the format). I ended up with this format because JSON is a little bit easier to deal with than XML.