Skip to content

Instantly share code, notes, and snippets.

@Wapiti08
Last active January 27, 2021 16:28
Show Gist options
  • Save Wapiti08/2d86de5fa458dfba466a58b8d800735f to your computer and use it in GitHub Desktop.
Save Wapiti08/2d86de5fa458dfba466a58b8d800735f to your computer and use it in GitHub Desktop.
Given a CSV with token and label as the columns, output a CSV with token and BILUO label as the columns.
'''
Examples in original_csv:
APT,Sharpshooter
APT,Sandworm Team
APT,Blue Mockingbird
APT,Playful Dragon
techniques,Compromise Software Supply Chain
techniques,Supply Chain Compromise
...
For fuzzy matching, you can use spaczz with spaCy.
'''
from itertools import groupby
import operator
def label_type(index, length):
    """Return the BILUO position prefix for one token of an entity span.

    Args:
        index: Zero-based position of the token inside the span.
        length: Total number of tokens in the span.

    Returns:
        "U" for the only token of a one-token span, "B" for the first
        token of a longer span, "L" for its last token, and "I" for any
        token in between.
    """
    if index == 0 and length == 1:
        # A one-token span is a "unit" in BILUO, not the beginning of a
        # longer span, so it must not be reported as "B".
        return "U"
    if index == 0:
        return "B"
    if index == length - 1:
        return "L"
    return "I"
def biluo_matching(iob_list, ent_type_list):
    """Convert a per-token entity-type sequence into BILUO tags.

    Consecutive tokens that share the same entity type are treated as one
    span. An empty string marks a token that belongs to no entity.

    Args:
        iob_list: Accepted for interface compatibility; not used by the
            tagging logic.
            NOTE(review): because this is ignored, two adjacent entities of
            the same type are merged into a single span. Presumably this
            was meant to carry per-token IOB markers that could split
            them; confirm the format with the caller before relying on it.
        ent_type_list: Entity type of every token, '' for non-entity tokens.

    Returns:
        A list of ``(prefix, ent_type)`` tuples, one per token, where
        prefix is "B", "I", "L", "U" or "O".
    """
    biluo_tags = []
    # groupby yields maximal runs of equal consecutive values, i.e. one run
    # per candidate span, in token order.
    for ent_type, run in groupby(ent_type_list):
        members = list(run)
        count = len(members)
        if ent_type == '':
            # Every token of a non-entity run is outside, however long the
            # run is; only real entities get B/I/L positions.
            prefixes = ['O'] * count
        elif count == 1:
            prefixes = ['U']
        else:
            prefixes = ['B'] + ['I'] * (count - 2) + ['L']
        biluo_tags.extend(zip(prefixes, members))
    return biluo_tags
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment