Created
February 9, 2017 16:12
-
-
Save LLCampos/5f1680941984c4b63f986965e7384e6c to your computer and use it in GitHub Desktop.
Converts annotations output from Noble Coder to WebAnno TSV 2 format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import os | |
import re | |
import nltk.data | |
"""Converts annotations output from Noble Coder (http://noble-tools.dbmi.pitt.edu/) to | |
WebAnno TSV 2 format (https://webanno.github.io/webanno/releases/3.0.1/docs/user-guide.html#sect_webannotsv) | |
Assumptions: | |
- Noble annotations have no overlaps | |
- The script is run in a location with 3 folders: | |
* 'text/' the input of Noble Coder;
* 'annot_noble/' the output of Noble Coder;
* 'annot_webanno_tsv/' where the results of the script will be written | |
""" | |
def get_file_names():
    """Return a list of the name prefixes of the files that were annotated.

    The prefixes come from the entries of the 'text' folder (relative to the
    current working directory): everything before the first '.' of each entry
    name, so 'report1.txt' yields 'report1'.
    """
    # Ignore the checkpoint folder created by Jupyter.
    # A real list is built (instead of a lazy filter/map chain) so the
    # documented return type holds on both Python 2 and Python 3.
    return [
        name.split('.')[0]
        for name in os.listdir('text')
        if name != '.ipynb_checkpoints'
    ]
def convert_to_indexes(noble_annot_index):
    """Convert the value of the column Annotation of the Noble TSV annotation
    file to just 'from' and 'to' values.

    The value holds comma-separated 'word/offset' items. 'from' is the
    smallest offset; 'to' is the largest offset plus the length of the word
    found at that offset.

    Ex:
    >>> convert_to_indexes('right/597, temporal/603, lobe/612')
    (597, 616)
    """
    # The pairs are scanned twice (min and max), so they must live in a list:
    # a one-shot iterator such as Python 3's map() would be empty on the
    # second scan. rsplit keeps a word that itself contains '/' in one piece.
    words = [
        item.strip().rsplit('/', 1) for item in noble_annot_index.split(',')
    ]
    first_word = min(words, key=lambda pair: int(pair[1]))
    last_word = max(words, key=lambda pair: int(pair[1]))
    return int(first_word[1]), int(last_word[1]) + len(last_word[0])
def get_noble_annotations():
    """Return dictionary with annotation indexes for each file.

    Reads 'annot_noble/RESULTS.tsv' (relative to the current working
    directory), skipping its first line (presumably the header row). Keys are
    the values of the first column truncated at their first '.'; each value is
    a list of {'from': int, 'to': int} dicts, in the order the rows appear in
    the file.
    """
    name_index = 0
    indexes_annot_index = 5

    with open('annot_noble/RESULTS.tsv') as f:
        raw_lines = f.readlines()[1:]

    noble_annots = {}
    for raw_line in raw_lines:
        stripped = raw_line.strip()
        if not stripped:
            # A blank line has no columns; skip it instead of failing on the
            # column lookup below.
            continue
        annot = stripped.split('\t')
        name_prefix = annot[name_index].split('.')[0]
        from_value, to_value = convert_to_indexes(annot[indexes_annot_index])
        noble_annots.setdefault(name_prefix, []).append(
            {'from': from_value, 'to': to_value}
        )
    return noble_annots
def split_span(s):
    """Split 's' into tokens, keeping the character offsets of each one.

    A token is a maximal run of word characters, a maximal run of characters
    that are neither word characters nor whitespace, or a maximal run of
    whitespace.

    Returns a list of (token_text, start, end) tuples, where 'end' is
    exclusive.
    """
    # re.UNICODE keeps \w and \s Unicode-aware on Python 2 as well (it is
    # already the default for str patterns on Python 3), matching the flag
    # used by get_fullstops_indexes(); without it accented letters would be
    # split out of their words.
    return [
        (match.group(0), match.start(), match.end())
        for match in re.finditer(
            r"[\w\d]+|[^\w\d\s]+|\s+", s, flags=re.UNICODE
        )
    ]
def tokenize_by_sentences(text):
    """Tokenize 'text' with the NLTK resource 'tokenizers/punkt/english.pickle'.

    The tokenizer is loaded on every call; the result of its tokenize()
    method is returned unchanged.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = punkt.tokenize(text)
    return sentences
def get_fullstops_indexes(text):
    """Return the offsets just after each full stop followed by whitespace.

    For every '.' in 'text' that is immediately followed by a whitespace
    character, the result holds the position of the '.' plus one (i.e. the
    index of that whitespace character), in order of appearance.
    """
    # Group 1 is the '.' itself, so its end is the offset right after it.
    return [
        stop.end(1)
        for stop in re.finditer(r'(\.)\s', text, flags=re.UNICODE)
    ]
def convert(text, annots, entity_type):
    """Return the annotations of 'text' in WebAnno TSV 2 format.

    'text' is the text that was annotated.
    'annots' are the annotations of 'text' in the format of a value of the
    dict output by get_noble_annotations(): dicts with 'from' and 'to' offsets.
    'entity_type' is the type of Named-Entity that was annotated by Noble.
    Ex: genes.

    Tokens come from split_span(). A token that starts exactly at an
    annotation's 'from' is tagged 'B-<entity_type>'; a token that starts after
    'from' and ends at or before 'to' is tagged 'I-<entity_type>'; every other
    token is tagged 'O'. A sentence ends at each full stop that is followed by
    whitespace. Whitespace tokens are not written to the output.
    """
    begin_tag = u'B-{}'.format(entity_type)
    inside_tag = u'I-{}'.format(entity_type)

    # Tokenize text and attach one tag per annotation that covers the token
    splitted_text = split_span(text)
    for annot in annots:
        for j, token in enumerate(splitted_text):
            token_from, token_to = token[1], token[2]
            if token_from == annot[u'from']:
                splitted_text[j] += (begin_tag,)
            elif token_from > annot[u'from'] and token_to <= annot[u'to']:
                splitted_text[j] += (inside_tag,)

    # Tokens that no annotation touched are outside any entity
    splitted_text = [
        token if len(token) > 3 else token + (u'O',)
        for token in splitted_text
    ]

    # A set makes the per-token sentence-boundary check constant time
    fullstops_indexes = set(get_fullstops_indexes(text))

    # Splitting removes the full stop from every piece except the last one, so
    # it is put back only on those pieces; appending it to the last piece as
    # well would give the final sentence a spurious extra '.'.
    pieces = re.split(r'\.\s', text, flags=re.UNICODE)
    sentence_tokenized_text = [piece + '.' for piece in pieces[:-1]]
    sentence_tokenized_text.append(pieces[-1])

    # One shared iterator: each sentence resumes where the previous one stopped
    remaining_tokens = iter(splitted_text)
    sentences = []
    for sentence_number, sentence_text in enumerate(sentence_tokenized_text, 1):
        sentence_annotations = []
        for token in remaining_tokens:
            if token[0].strip():
                sentence_annotations.append(u'{}-{}\t{}\t{}'.format(
                    sentence_number,
                    len(sentence_annotations) + 1,
                    token[0],
                    u'|'.join(token[3:]),
                ))
            if token[2] in fullstops_indexes:
                break

        if not sentence_annotations:
            # Happens for the trailing remainder when the text ends with a
            # full stop plus whitespace (or is empty): there is nothing to
            # annotate, so no sentence is emitted for it.
            continue

        sentence_header = u"#id={}\n".format(sentence_number)
        sentence_text = u"#text={}\n".format(sentence_text.replace('\n', ' '))
        sentences.append(
            sentence_header + sentence_text
            + u'\n'.join(sentence_annotations) + '\n'
        )

    webannot_header = u" # de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity | value\n"
    return webannot_header + '\n'.join(sentences)
def convert_all(entity_type):
    """Convert all the annotations in the RESULTS.tsv file resulting from
    Noble Coder into WebAnno TSV files, one for each document annotated.

    entity_type is a string representing the name of the type of named-entity
    annotated by Noble. Ex: gene

    For every name returned by get_file_names(), 'text/<name>.txt' is read and
    'annot_webanno_tsv/<name>.tsv' is written, both as UTF-8.
    """
    file_names = get_file_names()
    noble_annotations = get_noble_annotations()
    for file_name in file_names:
        # Binary mode with explicit decoding/encoding works the same on
        # Python 2 and Python 3.
        with open('text/{}.txt'.format(file_name), 'rb') as f:
            text = f.read().decode('utf-8')
        # A document with no rows in RESULTS.tsv has no entry in the dict; it
        # is still converted, with an empty annotation list.
        annots = noble_annotations.get(file_name, [])
        webannot_tsv_str = convert(text, annots, entity_type)
        with open('annot_webanno_tsv/{}.tsv'.format(file_name), 'wb') as f:
            f.write(webannot_tsv_str.encode('utf-8'))
# Script entry point: convert every document, using RADLEX as the entity type.
if __name__ == "__main__":
    convert_all("RADLEX")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment