Skip to content

Instantly share code, notes, and snippets.

Last active July 1, 2020 01:31
Show Gist options
  • Save Katsumata420/a033da437df41e1d804ab9f1d1efb989 to your computer and use it in GitHub Desktop.
Save Katsumata420/a033da437df41e1d804ab9f1d1efb989 to your computer and use it in GitHub Desktop.
A python script to convert annotated data in standoff format (brat format) into the BIO1 format (conll2003 format) for NER training
import argparse
import os
import re
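"""Convert brat standoff annotations to CoNLL-2003 format (IOB1).

input_text: brat text file; the file with the same basename and a '.ann'
    extension is used as the annotation file.
output_file: path of the output file in CoNLL format;
    if not given, the input_text basename + '.conll03' is used.

- This script converts only one file.
- Only NER tags are used; POS tags and chunks are not, so fixed
  placeholders are written instead (POS: PS; Chunks: CH).
- The input text must already be tokenised (including the period),
  e.g. "I am a student ."

This script is based on (the reference is not included in this copy).
"""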
# Fixed placeholder written in the POS column of every output row;
# this script emits NER tags only and produces no real POS tags.
POS = 'PS'
def get_args(argv=None):
    """Parse command-line options and derive the companion file paths.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is what
            the script entry point relies on.

    Returns:
        argparse.Namespace with three attributes:
        ``input_text``  - path given with ``--input_text`` / ``-i``;
        ``ann_file``    - same directory and basename as ``input_text``
                          with the extension replaced by ``.ann``;
        ``output_file`` - path given with ``--output_file`` / ``-o``, or,
                          when omitted, the ``input_text`` basename with
                          the extension replaced by ``.conll03``.
    """
    parser = argparse.ArgumentParser()
    # Adjacent string literals keep the help text free of the indentation
    # that a backslash continuation inside the literal would embed.
    parser.add_argument(
        '--input_text', '-i', required=True,
        help='text file you want to convert '
             '(if /path/to/hoge.txt is selected, '
             '/path/to/hoge.ann is also used. )')
    parser.add_argument(
        '--output_file', '-o', default='',
        help='output file path (default: input_text basename + .conll03)')
    args = parser.parse_args(argv)

    basename = os.path.splitext(os.path.basename(args.input_text))[0]
    dirname = os.path.dirname(args.input_text)
    args.ann_file = os.path.join(dirname, basename + '.ann')
    # An empty string means "not given": write next to the input file.
    if not args.output_file:
        args.output_file = os.path.join(dirname, basename + '.conll03')
    return args
def get_annotation(file_path):
    """Read the text-bound annotations from a brat ``.ann`` file.

    Each text-bound line has the form ``T<id>\\tTAG start end\\twords``.
    Blank lines and lines whose ID does not start with ``T`` (relations,
    attributes, notes, ...) are skipped because they carry no NER span.

    Args:
        file_path: Path of the ``.ann`` file.

    Returns:
        A list of dicts, one per text-bound annotation in file order, with
        the keys ``tag`` (NER tag), ``words`` (annotated text),
        ``start_position`` and ``end_position`` (int character offsets).

    Raises:
        ValueError: If a text-bound line does not have exactly three
            tab-separated fields, or its middle field is not exactly
            ``TAG start end`` (e.g. a discontinuous span).
    """
    annotations = []
    with open(file_path, encoding='utf-8') as i_f:
        for line_no, raw_line in enumerate(i_f, start=1):
            line = raw_line.strip()
            # Only text-bound annotations (IDs starting with 'T') have spans.
            if not line.startswith('T'):
                continue
            fields = line.split('\t')
            if len(fields) != 3:
                raise ValueError(
                    '{}:{}: expected 3 tab-separated fields, got {}'.format(
                        file_path, line_no, len(fields)))
            _idx, tag_positions, words = fields
            parts = tag_positions.split()
            if len(parts) != 3:
                raise ValueError(
                    '{}:{}: expected "TAG start end", got {!r}'.format(
                        file_path, line_no, tag_positions))
            tag, start, end = parts
            annotations.append({
                'tag': tag,
                'words': words,
                'start_position': int(start),
                'end_position': int(end),
            })
    return annotations
def convert(args, annotations, pos=None, chunk='CH'):
    """Write the tokenised text as CoNLL-style rows with NER tags.

    Every whitespace-separated token of ``args.input_text`` becomes one
    output row ``token POS CHUNK NER``; an empty line is written after
    each input line. A token whose character span lies inside an
    annotation is tagged ``B-<tag>`` when it starts exactly at the
    annotation start and ``I-<tag>`` when it starts after it; a token
    inside no annotation is tagged ``O``.

    Args:
        args: Object with the attributes ``input_text`` (path of the
            tokenised text file) and ``output_file`` (path to write).
        annotations: List of dicts with the keys ``tag``,
            ``start_position`` and ``end_position``, the offsets being
            character positions in the text file.
        pos: Placeholder for the POS column; ``None`` (the default) uses
            the module-level ``POS`` constant.
        chunk: Placeholder for the chunk column. Defaults to ``'CH'``, the
            value named in the file header, because the file defines no
            ``CHUNK`` constant.
    """
    if pos is None:
        pos = POS
    token_pattern = re.compile(r'\S+')
    offset_sentence = 0  # character offset of the current line in the file
    with open(args.input_text, encoding='utf-8') as i_f, \
            open(args.output_file, 'w', encoding='utf-8') as o_f:
        for line in i_f:
            # Take the offsets from the raw line so that leading, trailing
            # or repeated whitespace cannot shift them.
            for match in token_pattern.finditer(line):
                start_p = offset_sentence + match.start()
                end_p = offset_sentence + match.end()
                ner_anno = 'O'  # default for tokens outside every annotation
                # No early exit: with overlapping annotations the last
                # matching one wins.
                for annotation in annotations:
                    if end_p > annotation['end_position']:
                        continue
                    if start_p == annotation['start_position']:
                        ner_anno = 'B-' + annotation['tag']
                    elif start_p > annotation['start_position']:
                        ner_anno = 'I-' + annotation['tag']
                o_f.write('{} {} {} {}\n'.format(
                    match.group(), pos, chunk, ner_anno))
            o_f.write('\n')  # empty line
            offset_sentence += len(line)  # raw length includes the newline
def main():
    """Script entry point: parse the options, load the annotations, convert."""
    options = get_args()
    convert(options, get_annotation(options.ann_file))
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment