Created
October 8, 2013 08:15
-
-
Save cordarei/6881311 to your computer and use it in GitHub Desktop.
Converting annotations to brat format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
Convert a document in the corpus to the brat ANN format.

Call like 'doc2ann.py <path>/<document id>'. Outputs two files in the current
directory, <file>.txt and <file>.ann, where <file> is named from the
command-line argument.
"""
import argparse
import json
import operator
import sys
from itertools import count
def accumulate(iterable, func=operator.add):
    """Yield running totals of *iterable* (backport of itertools.accumulate).

    accumulate([1,2,3,4,5])               --> 1 3 6 10 15
    accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120

    An empty iterable yields nothing.  The explicit StopIteration handling
    avoids leaking StopIteration out of a generator, which is a RuntimeError
    under PEP 479 (Python 3.7+).
    """
    it = iter(iterable)
    try:
        total = next(it)
    except StopIteration:
        return  # empty input: produce an empty sequence of totals
    yield total
    for element in it:
        total = func(total, element)
        yield total
# Command line: one positional base path plus an (unused here) verbosity flag.
parser = argparse.ArgumentParser()
parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
parser.add_argument('base_file_path')
args = parser.parse_args()

# Derive the three input file paths from the base path.
base_file_path = args.base_file_path
sentence_file_path = base_file_path + '.sentences'
pos_file_path = base_file_path + '.pos'
dep_file_path = base_file_path + '.dep'
# Read the inputs.  Sentences are tokenized on whitespace; the POS and
# dependency annotations stay as raw JSON lines (parsed per sentence later).
# Context managers close the input files promptly instead of leaking the
# handles until interpreter shutdown.
with open(sentence_file_path) as sent_file:
    sentences = [[w for w in l.split() if w.strip()]
                 for l in sent_file if l.strip()]
with open(pos_file_path) as pos_file:
    pos_strings = pos_file.readlines()
with open(dep_file_path) as dep_file:
    dep_strings = dep_file.readlines()
# Output names: flatten the last four path components into one underscored
# file name, so documents from different corpus folders stay distinct in a
# flat output directory.
base_out_name = '_'.join(base_file_path.split('/')[-4:])
txt_out_name = base_out_name + '.txt'
ann_out_name = base_out_name + '.ann'
# Insert a dummy ROOT token so dependency head index 0 points at a real token.
sentences = [['ROOT'] + words for words in sentences]

# Start offset of each sentence in the .txt output; every sentence is
# followed by one newline character, hence the +1 per length.
sentence_lengths = [len(' '.join(words)) + 1 for words in sentences]
sentence_offsets = [0] + list(accumulate(sentence_lengths[:-1]))

# Start offset of each token within its own sentence; tokens are joined by
# single spaces, hence the +1 per length.
token_lengths = [[len(word) + 1 for word in words] for words in sentences]
token_offsets = [[0] + list(accumulate(lens[:-1])) for lens in token_lengths]
# Dump the (ROOT-prefixed) sentences to *.txt, one sentence per line.
# A plain write() replaces the Python-2-only `print >>` statement so the
# script also runs under Python 3; output bytes are identical.
with open(txt_out_name, 'w') as txt_out:
    for words in sentences:
        txt_out.write(' '.join(words) + '\n')
# Output annotations to *.ann
# Some Penn Treebank tags contain characters brat cannot use as type names
# (quotes, '$', punctuation); map those tags to safe ASCII identifiers.
pos_replacements = dict([
    ("''", "QUOTE_CLOSE"),
    ("``", "QUOTE_OPEN"),
    ('PRP', 'PRON'),
    ('PRP$', 'PRON_POS'),
    (',', 'COMMA'),
    (':', 'PUNCT'),
    ('.', 'PERIOD'),
    ('$', 'MONEY'),
    ('WP$', 'WP_POS'),
])
# ID generators.  Every brat annotation needs a unique ID: text-bound
# annotations are T1, T2, ...; relations are R1, R2, ...  One parameterized
# generator replaces the two previously duplicated generator functions.
def _id_gen(prefix):
    """Yield '<prefix>1', '<prefix>2', ... without end."""
    for n in count(1):
        yield "{}{}".format(prefix, n)

def token_id_gen():
    """Generate brat text-bound annotation IDs: T1, T2, ..."""
    return _id_gen("T")

def relation_id_gen():
    """Generate brat relation annotation IDs: R1, R2, ..."""
    return _id_gen("R")

token_ids = token_id_gen()
relation_ids = relation_id_gen()
# Emit the annotations to *.ann in brat standoff format: one T line per
# token (POS type + character span + surface form) and one R line per
# dependency (label + head/daughter token IDs).  The `with` block closes
# (and flushes) the output file, which the bare open() did not guarantee.
with open(ann_out_name, 'w') as ann_out:
    for i, (ws, pstr, dstr) in enumerate(zip(sentences, pos_strings, dep_strings)):
        try:
            # Absolute offset of this sentence, and token offsets within it.
            soff = sentence_offsets[i]
            woffs = token_offsets[i]
            # Each annotation line is one JSON document.
            ps = json.loads(pstr)   # list of POS tags
            ds = json.loads(dstr)   # list of [label, head, daughter] triples
            # Account for the dummy ROOT token prepended earlier: give it a
            # POS and shift every dependency index up by one.
            ps = ['ROOT'] + ps
            ds = [(r, h + 1, d + 1) for r, h, d in ds]
            # Text-bound annotations: id, (sanitized) POS, char span, word.
            ann_tokens = [
                dict(
                    tid=t,
                    pos=pos_replacements.get(p, p),
                    word=w,
                    start=soff + b,
                    end=soff + b + len(w)  # brat offsets are one-past-end
                )
                for t, p, b, w in zip(token_ids, ps, woffs, ws)
            ]
            # Binary relations between the token annotations above.
            ann_relations = [
                dict(rid=k, rel=r,
                     hid=ann_tokens[h]['tid'],
                     did=ann_tokens[d]['tid'])
                for k, (r, h, d) in zip(relation_ids, ds)
            ]
            # write() instead of the Python-2-only `print >>` statement keeps
            # the script runnable under Python 3; output bytes are identical.
            for tok in ann_tokens:
                ann_out.write("{tid}\t{pos} {start} {end}\t{word}\n".format(**tok))
            for rel in ann_relations:
                ann_out.write("{rid}\t{rel} Arg1:{hid} Arg2:{did}\n".format(**rel))
        except Exception:
            # Report which document/sentence failed, then propagate.
            sys.stderr.write('{file}:{sent}: conversion failed\n'.format(
                file=base_file_path, sent=i))
            raise
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi!
Yeah, I know the script isn't exactly useful as is -- it's meant to show an example of outputting the format brat expects. The most interesting parts are probably calculating the character offsets for the part-of-speech annotations and creating a unique ID for each annotation.
There are three input files: a file with tokenized text, one sentence per line; a file with POS tags (each line is a list of POS tags corresponding to a sentence in the text file); and a file with dependency relations (each line is a list of 3-element [label, head_index, daughter_index] lists, one for each sentence). If the CoreNLP output has character offsets you really don't need anything here, you can just read the XML and output the annotations (see lines 141-145 for the format). I ended up with this format because JSON is a little bit easier to deal with than XML.