Evaluation metric for Innoplexus NER Challenge: https://datahack.analyticsvidhya.com/contest/innoplexus-online-hiring-hackathon-saving-lives-wi/
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from copy import deepcopy
from collections import namedtuple

# Evaluation metric for Innoplexus NER Challenge
def collect_named_entities(tokens):  # Helper function for score calculation
    """
    Creates a list of Entity named-tuples, storing the entity type and the start and end
    offsets of the entity.

    :param tokens: a list of labels
    :return: a list of Entity named-tuples
    """
    Entity = namedtuple("Entity", "e_type start_offset end_offset")
    named_entities = []
    start_offset = None
    end_offset = None
    ent_type = None

    for offset, token_tag in enumerate(tokens):
        if token_tag == 'O':
            # an 'O' tag closes any entity that is currently open
            if ent_type is not None and start_offset is not None:
                end_offset = offset - 1
                named_entities.append(Entity(ent_type, start_offset, end_offset))
                start_offset = None
                end_offset = None
                ent_type = None
        elif ent_type is None:
            # first token of a new entity
            ent_type = token_tag[2:]
            start_offset = offset
        elif ent_type != token_tag[2:] or (ent_type == token_tag[2:] and token_tag[:1] == 'B'):
            # the entity type changes, or a 'B-' tag starts a new entity of the same type
            end_offset = offset - 1
            named_entities.append(Entity(ent_type, start_offset, end_offset))
            # start of a new entity
            ent_type = token_tag[2:]
            start_offset = offset
            end_offset = None

    # catches an entity that runs up to the last token; compare against None explicitly
    # so that an entity starting at offset 0 is not dropped
    if ent_type is not None and start_offset is not None and end_offset is None:
        named_entities.append(Entity(ent_type, start_offset, len(tokens) - 1))

    return named_entities
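
# Illustrative example (not part of the original gist; the tag values are hypothetical):
#
#   collect_named_entities(['O', 'B-indications', 'I-indications', 'O', 'B-indications'])
#   -> [Entity(e_type='indications', start_offset=1, end_offset=2),
#       Entity(e_type='indications', start_offset=4, end_offset=4)]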
def compute_metrics(true_named_entities, pred_named_entities):  # Helper function for score calculation
    eval_metrics = {'correct': 0, 'partial': 0, 'missed': 0, 'spurious': 0}
    target_tags_no_schema = ['indications']

    # overall results
    evaluation = {'partial': deepcopy(eval_metrics)}

    true_which_overlapped_with_pred = []  # keep track of true entities that overlapped with a prediction

    # go through each predicted named-entity
    for pred in pred_named_entities:
        found_overlap = False

        # 1. check for an exact match, i.e. both boundary and entity type match
        if pred in true_named_entities:
            true_which_overlapped_with_pred.append(pred)
            evaluation['partial']['correct'] += 1
        else:
            # 2. check for an overlap (not an exact boundary match) with any true entity
            for true in true_named_entities:
                if pred.start_offset <= true.end_offset and true.start_offset <= pred.end_offset:
                    true_which_overlapped_with_pred.append(true)
                    evaluation['partial']['partial'] += 1
                    found_overlap = True
                    break

            # count spurious (i.e. false positive) entities
            if not found_overlap:
                evaluation['partial']['spurious'] += 1

    # count missed entities (i.e. false negatives)
    for true in true_named_entities:
        if true not in true_which_overlapped_with_pred:
            evaluation['partial']['missed'] += 1

    # compute 'possible' and 'actual'
    for eval_type in ['partial']:
        correct = evaluation[eval_type]['correct']
        partial = evaluation[eval_type]['partial']
        missed = evaluation[eval_type]['missed']
        spurious = evaluation[eval_type]['spurious']

        # possible: number of gold-standard annotations that contribute to the final score
        evaluation[eval_type]['possible'] = correct + partial + missed
        # actual: number of annotations produced by the NER system
        evaluation[eval_type]['actual'] = correct + partial + spurious

    return evaluation
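
# Illustrative example (not part of the original gist; Entity is the named-tuple built in
# collect_named_entities, and the offsets below are hypothetical):
#
#   true = [Entity('indications', 1, 2), Entity('indications', 5, 6)]
#   pred = [Entity('indications', 1, 2), Entity('indications', 6, 7), Entity('indications', 9, 9)]
#   compute_metrics(true, pred)
#   -> {'partial': {'correct': 1, 'partial': 1, 'missed': 0, 'spurious': 1,
#                   'possible': 2, 'actual': 3}}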
def list_converter(df):  # Helper function for score calculation
    # group the tag column by sentence id, returning one list of tags per sentence
    keys, values = df.sort_values('Sent_ID_x').values.T
    ukeys, index = np.unique(keys, True)
    lists = [list(array) for array in np.split(values, index[1:])]
    return lists
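
# Illustrative example (not part of the original gist; the column names follow the merged
# dataframe built in calculate_score below, and the values are hypothetical):
#
#   df with columns (Sent_ID_x, tag):  (1, 'O'), (1, 'B-indications'), (2, 'O')
#   list_converter(df) -> [['O', 'B-indications'], ['O']]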
# ideal and pred are dataframes containing the actual labels and the predictions, respectively,
# for the set of sentences in the test data. Both have the same format as the sample
# submission (id, Sent_ID, tag).
def calculate_score(ideal, pred):  # Calculates the final F1 score
    merged = ideal.merge(pred, on="id", how="inner").drop(['Sent_ID_y'], axis=1)

    # The scores are calculated sentence-wise and then aggregated to compute the overall score.
    # list_converter groups the labels by sentence, producing a list of lists in which each
    # inner list is the tag sequence of one sentence.
    ideal_ = list_converter(merged.drop(['id', 'tag_y'], axis=1))
    pred_ = list_converter(merged.drop(['id', 'tag_x'], axis=1))

    metrics_results = {'correct': 0, 'partial': 0,
                       'missed': 0, 'spurious': 0, 'possible': 0, 'actual': 0}
    results = {'partial': deepcopy(metrics_results)}

    for true_ents, pred_ents in zip(ideal_, pred_):
        # compute results for one sentence
        tmp_results = compute_metrics(collect_named_entities(true_ents), collect_named_entities(pred_ents))
        # aggregate overall results
        for eval_schema in results.keys():
            for metric in metrics_results.keys():
                results[eval_schema][metric] += tmp_results[eval_schema][metric]

    correct = results['partial']['correct']
    partial = results['partial']['partial']
    missed = results['partial']['missed']
    spurious = results['partial']['spurious']
    actual = results['partial']['actual']
    possible = results['partial']['possible']

    # a partial (boundary-overlap) match counts as half a correct match in both precision and recall
    precision = (correct + 0.5 * partial) / actual if actual > 0 else 0
    recall = (correct + 0.5 * partial) / possible if possible > 0 else 0
    score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # final score
    return score
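
A minimal usage sketch, assuming two toy dataframes in the sample-submission format (id, Sent_ID, tag); the ids, sentence ids, and tags below are made up for illustration and are not from the challenge data:

if __name__ == "__main__":
    # gold labels for two short sentences
    ideal = pd.DataFrame({
        'id': [1, 2, 3, 4, 5],
        'Sent_ID': [1, 1, 1, 2, 2],
        'tag': ['O', 'B-indications', 'I-indications', 'O', 'B-indications'],
    })
    # predictions for the same tokens, with one boundary error in the first sentence
    pred = pd.DataFrame({
        'id': [1, 2, 3, 4, 5],
        'Sent_ID': [1, 1, 1, 2, 2],
        'tag': ['O', 'B-indications', 'O', 'O', 'B-indications'],
    })
    # one exact match and one partial (overlap) match -> prints 0.75
    print(calculate_score(ideal, pred))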