Last active
July 1, 2020 01:31
-
-
Save Katsumata420/a033da437df41e1d804ab9f1d1efb989 to your computer and use it in GitHub Desktop.
A python script to convert annotated data in standoff format (brat format) into the BIO1 format (conll2003 format) for NER training
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import os | |
import re | |
""" | |
convert brat2conll2003 (IOB1) | |
input: | |
input_text: brat text file; same basename + '.ann' is used as annotation file. | |
output_file: output file path converted to conll format | |
if not given, use input_text basename + '.conll03' | |
caution: | |
- This script convert only one file. | |
- only use NER tags; not use pos and chunks | |
POS: PS; Chunks: CH | |
- input text is tokenised (including period); | |
e.g. I am a student . | |
This script is based on https://gist.github.com/thatguysimon/6caa622be083f97b8c5c9a10478ba058. | |
""" | |
# CoNLL-2003 rows have four columns (token, POS, chunk, NER); this script only
# derives NER tags, so the POS and chunk columns are filled with fixed dummies.
DEFAULT_ANNOTATION='O'  # NER tag for tokens outside every annotated entity
POS = 'PS'  # dummy part-of-speech column value written to every row
CHUNK = 'B-CH'  # dummy chunk column value written to every row
def get_args():
    """Parse command-line options and derive the annotation/output paths.

    Returns the argparse namespace, augmented with ``ann_file`` (input
    basename + '.ann' in the same directory) and, when --output_file was
    omitted, a default ``output_file`` (input basename + '.conll03').
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_text', '-i', required=True,
                        help='text file you want to convert '
                             '(if /path/to/hoge.txt is selected, '
                             '/path/to/hoge.ann is also used. )')
    parser.add_argument('--output_file', '-o', default='')
    opts = parser.parse_args()

    stem = os.path.splitext(os.path.basename(opts.input_text))[0]
    parent = os.path.dirname(opts.input_text)
    opts.ann_file = os.path.join(parent, stem + '.ann')
    if not opts.output_file:
        opts.output_file = os.path.join(parent, stem + '.conll03')
    return opts
def get_annotation(file_path):
    """Read a brat .ann file and return its text-bound (entity) annotations.

    Each entity line has the form ``T<id>\\tTAG start end\\twords``.
    Valid .ann files may also contain relation/attribute/event/note lines
    (ids starting with R, A, E, #, ...); those carry no token span and are
    skipped, as are discontinuous spans (offsets containing ';'), which
    this converter does not support.

    :param file_path: path to the brat .ann annotation file
    :return: list of dicts with keys 'tag' (NER tag), 'words' (surface
        text), 'start_position' and 'end_position' (int char offsets)
    :raises ValueError: on a malformed text-bound annotation line
    """
    annotations = []
    with open(file_path) as i_f:
        for line in i_f:
            fields = line.strip().split('\t')
            # Only text-bound annotations ("T...") define entity spans;
            # the original asserted 3 fields and crashed on R/A/E/# lines.
            if not fields or not fields[0].startswith('T'):
                continue
            if len(fields) < 3:
                raise ValueError('malformed entity line: {!r}'.format(line))
            tag_positions, words = fields[1], fields[2]
            if ';' in tag_positions:
                # Discontinuous span ("start end;start end") -- unsupported.
                continue
            tag, start, end = tag_positions.split()
            annotations.append({
                'tag': tag,
                'words': words,
                'start_position': int(start),
                'end_position': int(end),
            })
    return annotations
def convert(args, annotations):
    """Convert the tokenised input text into CoNLL-2003-style rows.

    For every whitespace token, its character span is compared against the
    annotated entity spans: a token starting exactly at an entity start is
    tagged 'B-<tag>', a token starting strictly inside one is tagged
    'I-<tag>', anything else gets the 'O' default.  One
    "token POS CHUNK NER" row is written per token, with a blank line
    after each input line (sentence).

    NOTE(review): tagging every entity-initial token 'B-' is the IOB2
    scheme, while the header comment says IOB1 (where 'B-' only separates
    adjacent same-type entities) -- confirm which scheme is intended.
    NOTE(review): offsets assume each input line ends in exactly '\n' with
    no leading/trailing spaces (line.strip() + len(line)+1) -- confirm.

    :param args: namespace with 'input_text' and 'output_file' paths
    :param annotations: list of span dicts as built by get_annotation()
    """
    offset_sentence = 0  # char offset of the current line within the file
    with open(args.input_text) as i_f, open(args.output_file, 'w') as o_f:
        for line in i_f:
            line = line.strip()
            current_line_offset = 0
            for token in line.split():
                start_p = current_line_offset + offset_sentence
                end_p = start_p + len(token)
                current_line_offset = end_p + 1  # +1 for the space
                # (original kept a dead 'entity_found' flag here; removed)
                ner_anno = DEFAULT_ANNOTATION
                for annotation in annotations:
                    if end_p <= annotation['end_position']:
                        if start_p == annotation['start_position']:
                            ner_anno = 'B-' + annotation['tag']
                            break
                        elif start_p > annotation['start_position']:
                            ner_anno = 'I-' + annotation['tag']
                            break
                output_seq = '{} {} {} {}\n'.format(token, POS, CHUNK, ner_anno)
                o_f.write(output_seq)
            o_f.write('\n')  # blank line separates sentences
            offset_sentence += len(line) + 1  # +1 for the newline
def main():
    """Entry point: parse options, load annotations, write the CoNLL file."""
    options = get_args()
    entity_spans = get_annotation(options.ann_file)
    convert(options, entity_spans)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment