LLCampos/convert_noble_coder_annots_to_webanno_tsv.py

## convert_noble_coder_annots_to_webanno_tsv.py
# coding: utf-8

import io
import os
import re

import nltk.data


"""Converts annotations output from Noble Coder (http://noble-tools.dbmi.pitt.edu/) to
WebAnno TSV 2 format (https://webanno.github.io/webanno/releases/3.0.1/docs/user-guide.html#sect_webannotsv)

Assumptions:
- Noble annotations have no overlaps
- The script is run in a location with 3 folders:
   * 'text/' the input of Noble Coder;
   * 'annot_noble/' the output of Noble Coder;
   * 'annot_webanno_tsv/' where the results of the script will be written
"""


def get_file_names():
    """Return a list of the prefixes of the names of the files that were
    annotated.

    The names are read from the 'text' folder of the current working
    directory. The prefix of a name is everything before its first '.', so
    'report.txt' yields 'report'.

    Returns:
        A list of strings, in the order reported by os.listdir().
    """
    # Build a real list (not a lazy map/filter object) so that the documented
    # return type also holds under Python 3.
    return [
        name.split('.')[0]
        for name in os.listdir('text')
        # Ignore the Jupyter checkpoint folder
        if name != '.ipynb_checkpoints'
    ]


def convert_to_indexes(noble_annot_index):
    """Convert the value of the column Annotation of the Noble TSV annotation file to just
    'from' and 'to' values.

    'noble_annot_index' is a comma-separated list of 'word/offset' pairs, where
    'offset' is the position at which 'word' starts.

    Returns a tuple (from, to): the smallest start offset and the largest end
    offset (start offset plus word length) among the pairs.

    Raises ValueError if a pair does not end in an integer offset.

    Ex:
    >>> convert_to_indexes('right/597, temporal/603, lobe/612')
    (597, 616)"""

    # Collected in a list because it is traversed twice below; a lazy map()
    # object would already be exhausted on the second pass under Python 3.
    spans = []
    for pair in noble_annot_index.split(','):
        # Split on the LAST '/' so that a word which itself contains a '/'
        # still yields the right offset.
        word, _, offset = pair.strip().rpartition('/')
        start = int(offset)
        spans.append((start, start + len(word)))

    from_value = min(start for start, _ in spans)
    to_value = max(end for _, end in spans)

    return from_value, to_value


def get_noble_annotations():
    """Return dictionary with annotation indexes for each file.

    Reads 'annot_noble/RESULTS.tsv' from the current working directory. The
    first line is skipped (presumably the column header) and blank lines are
    ignored.

    Returns:
        A dict that maps the prefix of a file name (the text before its first
        '.') to a list of dicts {'from': int, 'to': int}, one per annotation
        line, in file order.
    """

    # Columns of RESULTS.tsv that are used here
    name_index = 0
    indexes_annot_index = 5

    noble_annots = {}
    with open('annot_noble/RESULTS.tsv') as f:
        for line in f.readlines()[1:]:
            stripped = line.strip()
            # A blank line (e.g. at the end of the file) holds no annotation
            # and would otherwise fail on the column lookup below.
            if not stripped:
                continue

            annot = stripped.split('\t')
            name_prefix = annot[name_index].split('.')[0]

            from_value, to_value = convert_to_indexes(annot[indexes_annot_index])
            indexes = {'from': from_value, 'to': to_value}

            noble_annots.setdefault(name_prefix, []).append(indexes)

    return noble_annots


def split_span(s):
    """Split 's' into tokens, keeping the position of each token.

    A token is a run of word characters, a run of characters that are neither
    word characters nor whitespace, or a run of whitespace. Whitespace runs
    are returned as tokens too.

    Returns:
        A list of tuples (token_text, start, end), where 'start' and 'end' are
        the offsets of the token in 's' ('end' is exclusive).
    """
    # re.UNICODE (as in get_fullstops_indexes) makes \w and \s cover non-ASCII
    # characters on Python 2 as well; on Python 3 it is already the default.
    token_pattern = re.compile(r"[\w\d]+|[^\w\d\s]+|\s+", flags=re.UNICODE)
    return [(match.group(0), match.start(), match.end())
            for match in token_pattern.finditer(s)]


def tokenize_by_sentences(text):
    """Tokenize 'text' with the NLTK resource 'tokenizers/punkt/english.pickle'.

    The resource is loaded on every call. Returns whatever the loaded
    tokenizer's tokenize() returns for 'text'.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = punkt.tokenize(text)
    return sentences


def get_fullstops_indexes(text):
    """Return the offsets that directly follow each '.' that is followed by
    a whitespace character.

    Each returned offset is the position of that whitespace character in
    'text'. A '.' that is not followed by whitespace is ignored.
    """
    period_then_space = re.compile(r'(\.)\s', flags=re.UNICODE)
    return [hit.start() + 1 for hit in period_then_space.finditer(text)]


def _format_sentence(number, sentence_text, token_lines):
    """Return the '#id=', '#text=' and token lines of one sentence as one string."""
    sentence_header = u"#id={}\n".format(number)
    sentence_text = u"#text={}\n".format(sentence_text.replace('\n', ' '))
    return sentence_header + sentence_text + u'\n'.join(token_lines) + '\n'


def convert(text, annots, entity_type):
    """Returns annotations in WebAnno TSV 2 format.

    'text' is the text that was annotated
    'annots' are the annotations of 'text' in the format of a value of the dict
    output by get_noble_annotations()
    'entity_type' is the type of Named-Entity that was annotated by Noble. Ex: genes.

    A sentence ends at every full stop that is followed by whitespace. Each
    sentence is written as a '#id=' line, a '#text=' line and one line per
    non-whitespace token. A token that starts where an annotation starts is
    labelled 'B-<entity_type>', a later token inside the annotation is
    labelled 'I-<entity_type>', and every other token is labelled 'O'.
    Text after the last full stop becomes a final sentence only if it
    contains at least one non-whitespace token.
    """

    # Tokenize text; every token is a (token_text, start, end) tuple
    tokens = split_span(text)

    # Labels of each token, collected in the order of 'annots'
    labels = [[] for _ in tokens]
    for annot in annots:
        for j, (_, token_from, token_to) in enumerate(tokens):
            if token_from == annot[u'from']:
                labels[j].append(u'B-{}'.format(entity_type))
            elif token_from > annot[u'from'] and token_to <= annot[u'to']:
                labels[j].append(u'I-{}'.format(entity_type))

    # A set keeps the per-token "does a sentence end here?" test cheap
    sentence_ends = set(get_fullstops_indexes(text))

    sentences = []
    sentence_start = 0  # offset in 'text' at which the current sentence starts
    token_lines = []    # output lines of the current sentence

    for (token_text, _, token_to), token_labels in zip(tokens, labels):
        # Whitespace tokens are not written to the output
        if token_text.strip():
            token_lines.append(u'{}-{}\t{}\t{}'.format(
                len(sentences) + 1,
                len(token_lines) + 1,
                token_text,
                u'|'.join(token_labels or [u'O'])))

        if token_to in sentence_ends:
            # The sentence text is sliced straight from 'text', so it holds
            # exactly the characters the tokens were taken from.
            sentences.append(_format_sentence(
                len(sentences) + 1, text[sentence_start:token_to], token_lines))
            # Skip the whitespace character that follows the full stop
            sentence_start = token_to + 1
            token_lines = []

    # Text left after the last full stop; nothing is emitted when it is empty
    # or whitespace only (e.g. a newline at the end of the file).
    if token_lines:
        sentences.append(_format_sentence(
            len(sentences) + 1, text[sentence_start:], token_lines))

    webannot_header = u" # de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity | value\n"

    return webannot_header + '\n'.join(sentences)


def convert_all(entity_type):
    """Converts all the annotations in the RESULTS.tsv file resulting from Noble Coder in many
    WebAnno TSV files, one corresponding for each document annotated.

    entity_type is a string representing the name of the type of named-entity annotated by Noble. Ex: gene

    For every name returned by get_file_names(), 'text/<name>.txt' is read and
    'annot_webanno_tsv/<name>.tsv' is written, both as UTF-8. A document that
    has no entry in the Noble annotations is converted with no annotations.
    """

    noble_annotations = get_noble_annotations()

    for file_name in get_file_names():

        # io.open decodes on both Python 2 and 3. newline='' leaves the line
        # endings untouched so that offsets into 'text' are not shifted
        # (presumably Noble computed its offsets on the unmodified file).
        with io.open('text/{}.txt'.format(file_name), encoding='utf-8', newline='') as f:
            text = f.read()

        # A document missing from the annotations must not abort the whole run
        annots = noble_annotations.get(file_name, [])

        webannot_tsv_str = convert(text, annots, entity_type)

        with io.open('annot_webanno_tsv/{}.tsv'.format(file_name), 'w', encoding='utf-8') as f:
            f.write(webannot_tsv_str)


if __name__ == '__main__':
    # Run as a script: convert every document, using 'RADLEX' as entity type
    convert_all(entity_type='RADLEX')
	# coding: utf-8

	import os
	import re
	import nltk.data


	"""Converts annotations output from Noble Coder (http://noble-tools.dbmi.pitt.edu/) to
	WebAnno TSV 2 format (https://webanno.github.io/webanno/releases/3.0.1/docs/user-guide.html#sect_webannotsv)

	Assumptions:
	- Noble annotations have no overlaps
	- The script is run in a location with 3 folders:
	* 'text/' the input of Noble Coder;
	* 'annot_noble/' the output of Noble Coder;
	* 'annot_webanno_tsv/' where the results of the script will be written
	"""


	def get_file_names():
		"""Return a list of the prefixes of the names of the files that were
		annotated.

		The names are read from the 'text' folder of the current working
		directory. The prefix of a name is everything before its first '.', so
		'report.txt' yields 'report'.

		Returns:
			A list of strings, in the order reported by os.listdir().
		"""
		# Build a real list (not a lazy map/filter object) so that the documented
		# return type also holds under Python 3.
		return [
			name.split('.')[0]
			for name in os.listdir('text')
			# Ignore the Jupyter checkpoint folder
			if name != '.ipynb_checkpoints'
		]


	def convert_to_indexes(noble_annot_index):
		"""Convert the value of the column Annotation of the Noble TSV annotation file to just
		'from' and 'to' values.

		'noble_annot_index' is a comma-separated list of 'word/offset' pairs, where
		'offset' is the position at which 'word' starts.

		Returns a tuple (from, to): the smallest start offset and the largest end
		offset (start offset plus word length) among the pairs.

		Raises ValueError if a pair does not end in an integer offset.

		Ex:
		>>> convert_to_indexes('right/597, temporal/603, lobe/612')
		(597, 616)"""

		# Collected in a list because it is traversed twice below; a lazy map()
		# object would already be exhausted on the second pass under Python 3.
		spans = []
		for pair in noble_annot_index.split(','):
			# Split on the LAST '/' so that a word which itself contains a '/'
			# still yields the right offset.
			word, _, offset = pair.strip().rpartition('/')
			start = int(offset)
			spans.append((start, start + len(word)))

		from_value = min(start for start, _ in spans)
		to_value = max(end for _, end in spans)

		return from_value, to_value


	def get_noble_annotations():
		"""Return dictionary with annotation indexes for each file.

		Reads 'annot_noble/RESULTS.tsv' from the current working directory. The
		first line is skipped (presumably the column header) and blank lines are
		ignored.

		Returns:
			A dict that maps the prefix of a file name (the text before its first
			'.') to a list of dicts {'from': int, 'to': int}, one per annotation
			line, in file order.
		"""

		# Columns of RESULTS.tsv that are used here
		name_index = 0
		indexes_annot_index = 5

		noble_annots = {}
		with open('annot_noble/RESULTS.tsv') as f:
			for line in f.readlines()[1:]:
				stripped = line.strip()
				# A blank line (e.g. at the end of the file) holds no annotation
				# and would otherwise fail on the column lookup below.
				if not stripped:
					continue

				annot = stripped.split('\t')
				name_prefix = annot[name_index].split('.')[0]

				from_value, to_value = convert_to_indexes(annot[indexes_annot_index])
				indexes = {'from': from_value, 'to': to_value}

				noble_annots.setdefault(name_prefix, []).append(indexes)

		return noble_annots


	def split_span(s):
		"""Split 's' into tokens, keeping the position of each token.

		A token is a run of word characters, a run of characters that are neither
		word characters nor whitespace, or a run of whitespace. Whitespace runs
		are returned as tokens too.

		Returns:
			A list of tuples (token_text, start, end), where 'start' and 'end' are
			the offsets of the token in 's' ('end' is exclusive).
		"""
		# The alternatives are separated by a plain '|'; an escaped '\|' would
		# demand a literal pipe character between them and match almost nothing.
		# re.UNICODE (as in get_fullstops_indexes) makes \w and \s cover
		# non-ASCII characters on Python 2 as well.
		token_pattern = re.compile(r"[\w\d]+|[^\w\d\s]+|\s+", flags=re.UNICODE)
		return [(match.group(0), match.start(), match.end())
				for match in token_pattern.finditer(s)]


	def tokenize_by_sentences(text):
		"""Tokenize 'text' with the NLTK resource 'tokenizers/punkt/english.pickle'.

		The resource is loaded on every call. Returns whatever the loaded
		tokenizer's tokenize() returns for 'text'.
		"""
		tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
		return tokenizer.tokenize(text)


	def get_fullstops_indexes(text):
		"""Return the offsets that directly follow each '.' that is followed by
		a whitespace character.

		Each returned offset is the position of that whitespace character in
		'text'. A '.' that is not followed by whitespace is ignored.
		"""
		fullstops_indexes = []
		for match in re.finditer(r'(\.)\s', text, flags=re.UNICODE):
			# match.start() is the position of the '.'; the next position is
			# the one right after it.
			fullstops_indexes.append(match.start() + 1)
		return fullstops_indexes


	def _format_sentence(number, sentence_text, token_lines):
		"""Return the '#id=', '#text=' and token lines of one sentence as one string."""
		sentence_header = u"#id={}\n".format(number)
		sentence_text = u"#text={}\n".format(sentence_text.replace('\n', ' '))
		return sentence_header + sentence_text + u'\n'.join(token_lines) + '\n'


	def convert(text, annots, entity_type):
		"""Returns annotations in WebAnno TSV 2 format.

		'text' is the text that was annotated
		'annots' are the annotations of 'text' in the format of a value of the dict
		output by get_noble_annotations()
		'entity_type' is the type of Named-Entity that was annotated by Noble. Ex: genes.

		A sentence ends at every full stop that is followed by whitespace. Each
		sentence is written as a '#id=' line, a '#text=' line and one line per
		non-whitespace token. A token that starts where an annotation starts is
		labelled 'B-<entity_type>', a later token inside the annotation is
		labelled 'I-<entity_type>', and every other token is labelled 'O'.
		Text after the last full stop becomes a final sentence only if it
		contains at least one non-whitespace token.
		"""

		# Tokenize text; every token is a (token_text, start, end) tuple
		tokens = split_span(text)

		# Labels of each token, collected in the order of 'annots'
		labels = [[] for _ in tokens]
		for annot in annots:
			for j, (_, token_from, token_to) in enumerate(tokens):
				if token_from == annot[u'from']:
					labels[j].append(u'B-{}'.format(entity_type))
				elif token_from > annot[u'from'] and token_to <= annot[u'to']:
					labels[j].append(u'I-{}'.format(entity_type))

		# A set keeps the per-token "does a sentence end here?" test cheap
		sentence_ends = set(get_fullstops_indexes(text))

		sentences = []
		sentence_start = 0  # offset in 'text' at which the current sentence starts
		token_lines = []    # output lines of the current sentence

		for (token_text, _, token_to), token_labels in zip(tokens, labels):
			# Whitespace tokens are not written to the output
			if token_text.strip():
				# Several labels of one token are separated by a plain '|'
				token_lines.append(u'{}-{}\t{}\t{}'.format(
					len(sentences) + 1,
					len(token_lines) + 1,
					token_text,
					u'|'.join(token_labels or [u'O'])))

			if token_to in sentence_ends:
				# The sentence text is sliced straight from 'text', so it holds
				# exactly the characters the tokens were taken from.
				sentences.append(_format_sentence(
					len(sentences) + 1, text[sentence_start:token_to], token_lines))
				# Skip the whitespace character that follows the full stop
				sentence_start = token_to + 1
				token_lines = []

		# Text left after the last full stop; nothing is emitted when it is empty
		# or whitespace only (e.g. a newline at the end of the file).
		if token_lines:
			sentences.append(_format_sentence(
				len(sentences) + 1, text[sentence_start:], token_lines))

		webannot_header = u" # de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity | value\n"

		return webannot_header + '\n'.join(sentences)


	def convert_all(entity_type):
		"""Converts all the annotations in the RESULTS.tsv file resulting from Noble Coder in many
		WebAnno TSV files, one corresponding for each document annotated.

		entity_type is a string representing the name of the type of named-entity annotated by Noble. Ex: gene

		For every name returned by get_file_names(), 'text/<name>.txt' is read and
		'annot_webanno_tsv/<name>.tsv' is written, both as UTF-8. A document that
		has no entry in the Noble annotations is converted with no annotations.
		"""

		noble_annotations = get_noble_annotations()

		for file_name in get_file_names():

			# io.open decodes on both Python 2 and 3. newline='' leaves the line
			# endings untouched so that offsets into 'text' are not shifted
			# (presumably Noble computed its offsets on the unmodified file).
			with io.open('text/{}.txt'.format(file_name), encoding='utf-8', newline='') as f:
				text = f.read()

			# A document missing from the annotations must not abort the whole run
			annots = noble_annotations.get(file_name, [])

			webannot_tsv_str = convert(text, annots, entity_type)

			with io.open('annot_webanno_tsv/{}.tsv'.format(file_name), 'w', encoding='utf-8') as f:
				f.write(webannot_tsv_str)


	if __name__ == '__main__':
		# Run as a script: convert every document, using 'RADLEX' as entity type
		convert_all('RADLEX')