Skip to content

Instantly share code, notes, and snippets.

@Katsumata420
Last active July 1, 2020 01:31
Show Gist options
  • Save Katsumata420/a033da437df41e1d804ab9f1d1efb989 to your computer and use it in GitHub Desktop.
Save Katsumata420/a033da437df41e1d804ab9f1d1efb989 to your computer and use it in GitHub Desktop.
A python script to convert annotated data in standoff format (brat format) into the BIO1 format (conll2003 format) for NER training
import argparse
import os
import re
"""
convert brat2conll2003 (IOB1)
input:
input_text: brat text file; same basename + '.ann' is used as annotation file.
output_file: output file path converted to conll format
if not given, use input_text basename + '.conll03'
caution:
- This script converts only one file.
- Only NER tags are used; the POS and chunk columns are filled with fixed placeholders
POS: PS; Chunks: B-CH
- input text is tokenised (including period);
e.g. I am a student .
This script is based on https://gist.github.com/thatguysimon/6caa622be083f97b8c5c9a10478ba058.
"""
# NER tag written for tokens that are not matched by any annotation.
DEFAULT_ANNOTATION='O'
# Fixed placeholder written in the POS column of every output row.
POS = 'PS'
# Fixed placeholder written in the chunk column of every output row.
CHUNK = 'B-CH'
def get_args(argv=None):
    """Parse command-line options and derive the companion file paths.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is the
            original behaviour.

    Returns:
        argparse.Namespace with the attributes:
            input_text: path of the text file given by the user.
            ann_file: path with the same directory and basename as
                ``input_text`` but with the ``.ann`` extension.
            output_file: path given with ``-o``; when omitted, the
                ``input_text`` basename with the ``.conll03`` extension in
                the same directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_text', '-i', required=True,
        help='text file you want to convert '
             '(if /path/to/hoge.txt is selected, /path/to/hoge.ann is also used. )')
    parser.add_argument('--output_file', '-o', default='')
    args = parser.parse_args(argv)

    basename = os.path.splitext(os.path.basename(args.input_text))[0]
    dirname = os.path.dirname(args.input_text)
    args.ann_file = os.path.join(dirname, basename + '.ann')
    # An empty output path means "not given": fall back to a sibling file.
    if not args.output_file:
        args.output_file = os.path.join(dirname, basename + '.conll03')
    return args
def get_annotation(file_path):
    """Read the text-bound annotations from a brat standoff (.ann) file.

    Expected line layout: ``T<id>\\t<TAG> <start> <end>\\t<words>``.
    Blank lines and lines whose ID does not start with ``T`` (relations,
    attributes, notes, ...) are skipped because they carry no token span.

    Args:
        file_path: path of the ``.ann`` file.

    Returns:
        A list of dicts, one per text-bound annotation, with the keys:
            tag: the NER tag (str)
            words: the annotated text (str)
            start_position: start character offset (int)
            end_position: end character offset (int)

    Raises:
        ValueError: if a ``T`` line does not have three tab-separated
            fields, or its second field is not ``TAG start end`` (this
            includes discontinuous spans such as ``0 5;16 23``), or an
            offset is not an integer.
    """
    annotations = []
    with open(file_path, encoding='utf-8') as i_f:
        for line_no, raw_line in enumerate(i_f, start=1):
            line = raw_line.strip()
            # Only text-bound annotations (IDs starting with 'T') have spans.
            if not line or not line.startswith('T'):
                continue
            fields = line.split('\t')
            if len(fields) != 3:
                raise ValueError(
                    '{}:{}: expected 3 tab-separated fields, got {}'.format(
                        file_path, line_no, len(fields)))
            _idx, tag_positions, words = fields
            parts = tag_positions.split()
            if len(parts) != 3:
                raise ValueError(
                    '{}:{}: expected "TAG start end" (discontinuous spans '
                    'are not supported), got {!r}'.format(
                        file_path, line_no, tag_positions))
            tag, start, end = parts
            annotations.append({
                'tag': tag,
                'words': words,
                'start_position': int(start),
                'end_position': int(end),
            })
    return annotations
def convert(args, annotations):
    """Write the tokenised text as CoNLL-2003 style rows with NER tags.

    Each whitespace-separated token of ``args.input_text`` becomes one row
    ``token POS CHUNK ner_tag`` in ``args.output_file``; an empty line is
    written after every input line. A token whose span starts exactly at an
    annotation start (and ends within it) is tagged ``B-<tag>``; a token that
    starts later but still ends within the annotation is tagged ``I-<tag>``;
    every other token gets ``DEFAULT_ANNOTATION``. The first annotation in
    list order that matches a token wins.

    Token offsets are taken from the real character positions in the file,
    so repeated spaces or leading/trailing whitespace on a line do not shift
    the offsets of later tokens.

    Args:
        args: object with ``input_text`` and ``output_file`` path attributes.
        annotations: list of dicts with the keys ``tag``, ``start_position``
            and ``end_position`` (as returned by ``get_annotation``).
    """
    # Character offset of the start of the current line within the document.
    offset_sentence = 0
    with open(args.input_text, encoding='utf-8') as i_f, \
            open(args.output_file, 'w', encoding='utf-8') as o_f:
        for raw_line in i_f:
            for match in re.finditer(r'\S+', raw_line):
                token = match.group()
                start_p = offset_sentence + match.start()
                end_p = offset_sentence + match.end()
                ner_anno = DEFAULT_ANNOTATION
                for annotation in annotations:
                    # The token must end inside the annotated span.
                    if end_p > annotation['end_position']:
                        continue
                    if start_p == annotation['start_position']:
                        ner_anno = 'B-' + annotation['tag']
                        break
                    if start_p > annotation['start_position']:
                        ner_anno = 'I-' + annotation['tag']
                        break
                o_f.write('{} {} {} {}\n'.format(token, POS, CHUNK, ner_anno))
            o_f.write('\n')  # empty line marks the sentence boundary
            # The raw line still contains its newline, so its length is the
            # exact distance to the start of the next line.
            offset_sentence += len(raw_line)
def main():
    """Parse the options, load the annotations and write the converted file."""
    cli_args = get_args()
    convert(cli_args, get_annotation(cli_args.ann_file))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment