Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
A python script to convert annotated data in standoff format (brat annotation tool) to the formats expected by Stanford NER and Relation Extractor models
# A python script to turn annotated data in standoff format (brat annotation tool) to the formats expected by Stanford NER and Relation Extractor models
# - NER format based on:
# - RE format based on:
# Usage:
# 1) Install the pycorenlp package
# 2) Run CoreNLP server (change CORENLP_SERVER_ADDRESS if needed)
# 3) Place .ann and .txt files from brat in the location specified in DATA_DIRECTORY
# 4) Run this script
# Cross-sentence annotation is not supported
from pycorenlp import StanfordCoreNLP
import os
from os import listdir
from os.path import isfile, join
# Address of the CoreNLP server this script talks to (see usage step 2 above).
CORENLP_SERVER_ADDRESS = 'http://localhost:9000'
# NOTE(review): OUTPUT_DIRECTORY is referenced here but is not defined anywhere
# in this excerpt -- its definition appears to be missing; confirm against the
# original script.
# Output file that receives one "<token>\t<label>" row per token.
NER_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 'ner-crf-training-data.tsv')
# Output file that receives the tab-separated relation-extraction rows.
RE_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 're-training-data.corp')
# NOTE(review): the two `if` statements below have no bodies in this excerpt.
# Because both output files are later opened in append mode, they presumably
# removed stale output from a previous run -- TODO confirm and restore.
if os.path.exists(OUTPUT_DIRECTORY):
if os.path.exists(RE_TRAINING_DATA_OUTPUT_PATH):
# Running sentence counter; written as the first column of every RE token row
# and incremented after each sentence.
sentence_count = 0
# Main conversion loop: for every .ann file in DATA_DIRECTORY, parse its
# entity/relation annotations, tokenize the matching .txt file with CoreNLP,
# and append rows to the NER and RE training-data files.
# NOTE(review): indentation was lost in this copy of the script, so the nesting
# of the statements below cannot be read from the text; DATA_DIRECTORY,
# STANDOFF_ENTITY_PREFIX, STANDOFF_RELATION_PREFIX, DEFAULT_OTHER_ANNO and `nlp`
# are used but not defined in this excerpt.
# looping through .ann files in the data directory
# NOTE(review): the filter compares the SECOND dot-separated component of the
# file name with 'ann', so a name without a dot raises IndexError and a name
# such as "a.b.ann" is skipped.
ann_data_files = [f for f in listdir(DATA_DIRECTORY) if isfile(join(DATA_DIRECTORY, f)) and f.split('.')[1] == 'ann']
for file in ann_data_files:
# Per-file accumulators for the parsed annotations.
entities = []
relations = []
# process .ann file - place entities and relations into 2 separate lists
with open(join(DATA_DIRECTORY, file), 'r') as document_anno_file:
lines = document_anno_file.readlines()
for line in lines:
# Split the annotation line on whitespace; the first character of the first
# field selects between an entity line and a relation line.
# NOTE(review): an empty line yields an empty list, so the [0][0] lookup
# below would raise IndexError.
standoff_line = line.split()
if standoff_line[0][0] == STANDOFF_ENTITY_PREFIX:
entity = {}
# Numeric part of the ID (the one-character prefix is dropped).
entity['standoff_id'] = int(standoff_line[0][1:])
entity['entity_type'] = standoff_line[1].capitalize()
# Character offsets of the annotated span in the .txt file.
entity['offset_start'] = int(standoff_line[2])
entity['offset_end'] = int(standoff_line[3])
# Only the first whitespace-separated word of the annotated text is kept.
entity['word'] = standoff_line[4]
# NOTE(review): nothing visible here adds `entity` to `entities`; an
# `entities.append(entity)` was presumably lost -- TODO confirm.
elif standoff_line[0][0] == STANDOFF_RELATION_PREFIX:
relation = {}
relation['standoff_id'] = int(standoff_line[0][1:])
relation['name'] = standoff_line[1]
# Each argument field is split on ':'; the part after the colon is an entity
# ID whose one-character prefix is dropped before converting to int.
relation['standoff_entity1_id'] = int(standoff_line[2].split(':')[1][1:])
relation['standoff_entity2_id'] = int(standoff_line[3].split(':')[1][1:])
# NOTE(review): likewise, nothing visible adds `relation` to `relations`; the
# commented-out tuple-based append below is the only trace of it.
# relations.append((standoff_id, relation_name, standoff_entity1_id, standoff_entity2_id))
# read the .ann's matching .txt file and tokenize its text using stanford corenlp
with open(join(DATA_DIRECTORY, file.replace('.ann', '.txt')), 'r') as document_text_file:
# NOTE(review): the right-hand side of this assignment is missing in this
# excerpt (presumably a read of document_text_file) -- TODO restore.
document_text =
# Request tokenization, sentence splitting and POS tagging as JSON.
# NOTE(review): the closing `})` of this call is missing in this excerpt.
output = nlp.annotate(document_text, properties={
'annotators': 'tokenize,ssplit,pos',
'outputFormat': 'json'
# write text and annotations into NER and RE output files
# Both files are opened in append mode, so rows accumulate across input files.
with open(NER_TRAINING_DATA_OUTPUT_PATH, 'a') as ner_training_data, open(RE_TRAINING_DATA_OUTPUT_PATH, 'a') as re_training_data:
for sentence in output['sentences']:
# Maps an entity's standoff ID to the index of its row in sentence_re_rows.
entities_in_sentence = {}
# RE rows for this sentence; the tokens of a multi-token entity share one row.
sentence_re_rows = []
for token in sentence['tokens']:
offset_start = int(token['characterOffsetBegin'])
offset_end = int(token['characterOffsetEnd'])
re_row = {}
entity_found = False
# searching for token in annotated entities
for entity in entities:
# The token lies entirely inside the entity span: use the entity type as
# the token's NER label.
if offset_start >= entity['offset_start'] and offset_end <= entity['offset_end']:
ner_anno = entity['entity_type']
# multi-token entities for RE need to be handled differently than NER
# First token of the entity: record the row index for this entity and fill
# in a new RE row.
if offset_start == entity['offset_start'] and offset_end <= entity['offset_end']:
entities_in_sentence[entity['standoff_id']] = len(sentence_re_rows)
re_row['entity_type'] = entity['entity_type']
re_row['pos_tag'] = token['pos']
re_row['word'] = token['word']
entity_found = True
# Later token of the same entity: merge its POS tag and word into the most
# recent RE row, separated by '/'.
elif offset_start > entity['offset_start'] and offset_end <= entity['offset_end'] and len(sentence_re_rows) > 0:
sentence_re_rows[-1]['pos_tag'] += '/{}'.format(token['pos'])
sentence_re_rows[-1]['word'] += '/{}'.format(token['word'])
entity_found = True
# Token matched no entity: give it the default "other" label.
if not entity_found:
re_row['entity_type'] = DEFAULT_OTHER_ANNO
re_row['pos_tag'] = token['pos']
re_row['word'] = token['word']
# NOTE(review): no visible statement adds re_row to sentence_re_rows, and
# ner_anno is only assigned inside the entity match above; an append and a
# default for ner_anno were presumably lost -- TODO confirm.
# writing tagged tokens to NER training data
ner_training_data.write('{}\t{}\n'.format(token['word'], ner_anno))
# writing tagged tokens to RE training data
# One line per RE row: sentence number, entity type, row index, 'O', POS tag,
# word, then three 'O' columns.
token_count = 0
for sentence_row in sentence_re_rows:
re_training_data.write('{}\t{}\t{}\tO\t{}\t{}\tO\tO\tO\n'.format(str(sentence_count), sentence_row['entity_type'], str(token_count), sentence_row['pos_tag'], sentence_row['word']))
token_count += 1
# writing relations to RE training data
# A relation is written only when both of its entities were found in this
# sentence; the entities are referenced by their row index.
for relation in relations:
if relation['standoff_entity1_id'] in entities_in_sentence and relation['standoff_entity2_id'] in entities_in_sentence:
entity1 = str(entities_in_sentence[relation['standoff_entity1_id']])
entity2 = str(entities_in_sentence[relation['standoff_entity2_id']])
relation_name = relation['name']
re_training_data.write('{}\t{}\t{}\n'.format(entity1, entity2, relation_name))
sentence_count += 1
print('Processed file pair: {} and {}'.format(file, file.replace('.ann', '.txt')))
Copy link

GurkiratSarna commented Aug 14, 2020

Hello, I am trying your script but I am facing issues with the CoreNLP server. I downloaded the CoreNLP folder, started the server from my command prompt, and imported and installed the corenlp package in Google Colab; however, it fails with 'Exception: Check whether you have started the CoreNLP server'. So I opened localhost:9000 and tried getting the output on the server for a random single sentence. The server doesn't respond and my command prompt shows the error 'java.lang.OutOfMemoryError: Java heap space'. While starting the server I used the command 'java -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos,lemma,parse \ -port 9000 -timeout 75000'.

I am aware this is not directly related to NLP, however, I would like to try your code for my project and would like any help that you can offer for this. Thank you.

Copy link

thatguysimon commented Aug 16, 2020

Hey Gurkirat, try adding a question on StackOverflow with tags like stanford-nlp, core-nlp, etc.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment