Skip to content

Instantly share code, notes, and snippets.

@larsmans
Last active August 29, 2015 14:10
Show Gist options
  • Save larsmans/8a5d1f5e272674e1b7f9 to your computer and use it in GitHub Desktop.
Brat-to-CSV converter
# Quick and dirty Brat-to-CSV conversion.
from __future__ import print_function
import csv
import io
import re
import sys
# copy server/src/{gtbtokenize,tokenise}.py from Brat
from tokenise import gtb_token_boundary_gen
def read_annot(fname):
    """Parse a Brat standoff .ann file.

    Parameters
    ----------
    fname : str
        Path to a Brat .ann annotation file.

    Returns
    -------
    (ann, level) : tuple of dict
        ann   maps each token start offset -> (label, token_end, ident)
              for the tokens covered by a text-bound ("T") annotation.
        level maps annotation ident -> value, taken from attribute lines
              whose attribute name is exactly 'Level'.
    """
    ann = {}
    level = {}
    # Brat stores .ann files as UTF-8; decode explicitly so offsets match
    # the UTF-8-decoded review text (plain open() would yield raw bytes on
    # Python 2 and the platform default encoding on Python 3).
    with io.open(fname, encoding='utf-8') as f:
        for ln in f:
            if ln.startswith('T'):
                # Text-bound annotation: "T<id>\t<label> <start> <end>\t<text>"
                ident, label, start, end, text = ln.split(None, 4)
                start, end = int(start), int(end)
                for t_start, t_end in gtb_token_boundary_gen(text):
                    # Index annotations by token start only, because it's
                    # too hard to get the tokenizer to behave just like it
                    # does in Brat and the ends tend to go wrong.
                    #ann[(start + int(t_start), start + int(t_end))] = label
                    ann[start + int(t_start)] = (label, int(t_end), ident)
            else:
                try:
                    _, lev, ident, value = ln.split()
                    if lev != 'Level':
                        continue
                    level[ident] = value
                except ValueError:
                    # Not a four-field attribute line; skip it (best effort).
                    pass
    return ann, level
# ---- Command-line driver: align two annotators' labels token by token ----
if len(sys.argv) != 4:
    print("usage: %s review.txt user1.ann user2.ann" % sys.argv[0],
          file=sys.stderr)
    sys.exit(1)

txt_name, ann1_name, ann2_name = sys.argv[1:]

# The review text drives tokenization; both .ann files are indexed against it.
with io.open(txt_name, encoding='utf-8') as f:
    text = f.read()

tok_bound = list(gtb_token_boundary_gen(text))
ann1, level1 = read_annot(ann1_name)
ann2, level2 = read_annot(ann2_name)

wr = csv.writer(sys.stdout, dialect='excel')
wr.writerow(['Token',
             #'Start', 'End',
             'Label1', 'Level1', 'Label2', 'Level2'])

# Placeholder used when an annotator has no label starting at this token.
NO_ANNOT = ('', 0, '')
for start, end in tok_bound:
    label1, _end1, ident1 = ann1.get(start, NO_ANNOT)
    label2, _end2, ident2 = ann2.get(start, NO_ANNOT)
    # One CSV row per token: surface form plus each annotator's label/level.
    wr.writerow([text[start:end].encode('utf-8'),
                 #start, end,
                 label1, level1.get(ident1, ''),
                 label2, level2.get(ident2, '')])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment