Last active
January 27, 2021 16:28
-
-
Save Wapiti08/2d86de5fa458dfba466a58b8d800735f to your computer and use it in GitHub Desktop.
Given a CSV with token and label as the columns, output the token and its BILUO label as the columns.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Examples in original_csv:
APT,Sharpshooter
APT,Sandworm Team
APT,Blue Mockingbird
APT,Playful Dragon
techniques,Compromise Software Supply Chain
techniques,Supply Chain Compromise
...
For fuzzy matching, you can use spaczz with spaCy.
'''
from itertools import groupby | |
import operator | |
def label_type(index, length):
    """Return the BILUO position tag for one token of an entity span.

    Args:
        index: Zero-based position of the token inside the span.
        length: Total number of tokens in the span.

    Returns:
        "U" for the only token of a one-token span, "B" for the first
        token, "L" for the last token, and "I" for any token in between.
    """
    # A one-token span is both first and last; BILUO marks it as a Unit
    # rather than as a Begin with no matching Last.
    if length == 1:
        return "U"
    if index == 0:
        return "B"
    if index == length - 1:
        return "L"
    return "I"
def biluo_matching(iob_list, ent_type_list):
    """Convert a per-token entity-type sequence into BILUO tags.

    Consecutive tokens that share the same entity type are treated as one
    span. Tokens whose entity type is the empty string are outside any
    entity and are always tagged "O", however many of them occur in a row.

    Args:
        iob_list: Accepted for interface compatibility; not read here.
            NOTE(review): presumably the per-token IOB codes of the same
            document -- confirm against the caller.
        ent_type_list: Per-token entity types, with '' meaning "no entity".

    Returns:
        A list with one ``(tag, ent_type)`` tuple per token, in input
        order, where ``tag`` is one of "B", "I", "L", "U" or "O".

    Note:
        Two adjacent entities of the same type cannot be told apart from
        the type sequence alone, so they are tagged as a single span.
    """
    biluo_tags = []
    for ent_type, run in groupby(ent_type_list):
        run = list(run)
        length = len(run)
        if ent_type == '':
            # Non-entity tokens never form a span, even when adjacent.
            positions = ['O'] * length
        elif length == 1:
            positions = ['U']
        else:
            positions = ['B'] + ['I'] * (length - 2) + ['L']
        biluo_tags.extend(zip(positions, run))
    return biluo_tags
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment