Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save LLCampos/5f1680941984c4b63f986965e7384e6c to your computer and use it in GitHub Desktop.
Save LLCampos/5f1680941984c4b63f986965e7384e6c to your computer and use it in GitHub Desktop.
Converts annotations output from Noble Coder to WebAnno TSV 2 format
# coding: utf-8
import os
import re
import nltk.data
"""Converts annotations output from Noble Coder (http://noble-tools.dbmi.pitt.edu/) to
WebAnno TSV 2 format (https://webanno.github.io/webanno/releases/3.0.1/docs/user-guide.html#sect_webannotsv)
Assumptions:
- Noble annotations have no overlaps
- The script is run in a location with 3 folders:
* 'text/' the input of Noble Coder;
* 'annot_noble/' the output of Noble Coder;
* 'annot_webanno_tsv/' where the results of the script will be written
"""
def get_file_names():
    """Return a list of the prefixes of the names of the files that were
    annotated.

    Lists the 'text/' input directory, drops the Jupyter checkpoint entry,
    and keeps only the part of each file name before the first '.'
    (i.e. strips the '.txt' extension).
    """
    # A single list comprehension replaces the filter+map pair: it is the
    # idiomatic form and returns a real list under both Python 2 and
    # Python 3 (where map/filter are lazy one-shot iterators).
    return [name.split('.')[0]
            for name in os.listdir('text')
            if name != '.ipynb_checkpoints']
def convert_to_indexes(noble_annot_index):
    """Convert the value of the Annotation column of the Noble TSV
    annotation file to 'from' and 'to' character offsets.

    Ex:
    >>> convert_to_indexes('right/597, temporal/603, lobe/612')
    (597, 616)
    """
    # Materialize the split words as a list.  The original used map(),
    # which under Python 3 is a one-shot iterator: min() exhausted it and
    # the subsequent max() raised "max() arg is an empty sequence".
    words = [word.strip().split('/') for word in noble_annot_index.split(',')]
    word_from = min(words, key=lambda w: int(w[1]))
    word_to = max(words, key=lambda w: int(w[1]))
    from_value = int(word_from[1])
    # The end offset is the start of the last word plus its length.
    to_value = int(word_to[1]) + len(word_to[0])
    return from_value, to_value
def get_noble_annotations():
    """Return a dict mapping each annotated file's name prefix to a list of
    annotation index dicts of the form {'from': int, 'to': int}.

    Parses 'annot_noble/RESULTS.tsv'.  The header line is skipped; column 0
    holds the annotated file name and column 5 the Annotation offsets.
    """
    name_index = 0
    indexes_annot_index = 5
    noble_annots = {}
    with open('annot_noble/RESULTS.tsv') as f:
        next(f)  # skip the TSV header row
        # Stream the file line by line instead of readlines()+map.
        for line in f:
            annot = line.strip().split('\t')
            name_prefix = annot[name_index].split('.')[0]
            from_value, to_value = convert_to_indexes(annot[indexes_annot_index])
            # setdefault replaces the `not in noble_annots.keys()` test and
            # avoids the double dict lookup.
            noble_annots.setdefault(name_prefix, []).append(
                {'from': from_value, 'to': to_value})
    return noble_annots
def split_span(s):
    """Tokenize *s* into (token, start, end) triples.

    A token is a maximal run of word characters, of non-word non-space
    characters, or of whitespace, so the triples jointly cover every
    character of *s* in order.
    """
    token_pattern = re.compile(r"[\w\d]+|[^\w\d\s]+|\s+")
    return [(m.group(0), m.start(), m.end())
            for m in token_pattern.finditer(s)]
def tokenize_by_sentences(text):
    """Split *text* into sentences using NLTK's pre-trained English
    Punkt sentence tokenizer."""
    return nltk.data.load('tokenizers/punkt/english.pickle').tokenize(text)
def get_fullstops_indexes(text):
    """Return the offsets just past each full stop that is followed by a
    whitespace character in *text* (i.e. the end index of each
    sentence-terminating '.')."""
    return [match.start() + 1
            for match in re.finditer(r'(\.)\s', text, flags=re.UNICODE)]
def convert(text, annots, entity_type):
    """Returns annotations in Webannot TSV 2 format.
    'text' is the text that was annotated
    'annots' are the annotations of 'text' in the format of a value of the dict
    output by get_noble_annotations()
    'entity_type' is the type of Named-Entity that was annotated by Noble. Ex: genes.
    """
    # Tokenize text
    splitted_text = split_span(text)
    # BIO-tag the tokens: a token starting exactly at an annotation's 'from'
    # gets B-<type>; tokens strictly after 'from' that end at or before 'to'
    # get I-<type>.  Relies on the no-overlap assumption stated in the
    # module docstring, since each match appends another tag tuple element.
    for annot in annots:
        for j, token in enumerate(splitted_text):
            token_from = token[1]
            token_to = token[2]
            if token_from == annot[u'from']:
                splitted_text[j] += (u'B-{}'.format(entity_type),)
            elif token_from > annot[u'from'] and token_to <= annot[u'to']:
                splitted_text[j] += (u'I-{}'.format(entity_type),)
    # Any token still a bare (text, from, to) triple is outside all
    # annotations and is tagged 'O'.
    for j in range(len(splitted_text)):
        if len(splitted_text[j]) == 3:
            splitted_text[j] += (u'O',)
    # Cursor into splitted_text so each sentence resumes where the
    # previous one stopped.
    splitted_text_current_line = 0
    sentences = []
    fullstops_indexes = get_fullstops_indexes(text)
    # Sentence texts are produced by splitting on '. ' and re-appending the
    # '.'; the loop below walks tokens in parallel, breaking at each full
    # stop.  NOTE(review): the last sentence gains a trailing '.' it may not
    # have had in the original text — presumably acceptable here.
    sentence_tokenized_text = map(lambda x: x + '.', re.split(r'\.\s', text, flags=re.UNICODE))
    for sentence_number, sentence_text in enumerate(sentence_tokenized_text):
        sentence_annotations = []
        line_number = 0  # token counter within the sentence (1-based in output)
        for token in splitted_text[splitted_text_current_line:]:
            # Skip whitespace-only tokens; they advance the cursor but are
            # not emitted as TSV rows.
            if token[0].strip() not in [u'']:
                sentence_annotations.append(u'{}-{}\t{}\t{}'.format(sentence_number + 1, line_number + 1, token[0], u'|'.join(token[3:])))
                line_number += 1
            splitted_text_current_line += 1
            # A token ending at a sentence-final '.' closes this sentence.
            if token[2] in fullstops_indexes:
                break
        sentence_header = u"#id={}\n".format(sentence_number + 1)
        sentence_text = u"#text={}\n".format(sentence_text.replace('\n', ' '))
        sentence_annotations = u'\n'.join(sentence_annotations) + '\n'
        sentence = sentence_header + sentence_text + sentence_annotations
        sentences.append(sentence)
    # Single-layer WebAnno header declaring the NamedEntity annotation layer.
    webannot_header = u" # de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity | value\n"
    return webannot_header + '\n'.join(sentences)
def convert_all(entity_type):
    """Converts all the annotatios in the RESULTS.tsv file resulting from Noble Coder in many
    WebAnno TSV files, one corresponding for each document annotated.
    entity_type is a string representing the name of the type of named-entity annotated by Noble. Ex: gene"""
    file_names = get_file_names()
    noble_annotations = get_noble_annotations()
    for file_name in file_names:
        with open('text/{}.txt'.format(file_name)) as f:
            # NOTE: str.decode() is Python 2 only — this script targets
            # Python 2 (under Python 3 use open(..., encoding='utf-8')).
            text = f.read().decode('utf-8')
        annots = noble_annotations[file_name]
        webannot_tsv_str = convert(text, annots, entity_type)
        # Write one WebAnno TSV file per annotated document, UTF-8 encoded.
        with open('annot_webanno_tsv/{}.tsv'.format(file_name), 'w') as f:
            f.write(webannot_tsv_str.encode('utf-8'))
if __name__ == '__main__':
    # Run the full conversion using the RadLex ontology as the entity type.
    convert_all('RADLEX')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment