Skip to content

Instantly share code, notes, and snippets.

@konverner
Created December 11, 2022 00:57
Show Gist options
  • Save konverner/8c3a8deb3cde5c08c512a5b678189a96 to your computer and use it in GitHub Desktop.
Save konverner/8c3a8deb3cde5c08c512a5b678189a96 to your computer and use it in GitHub Desktop.
convert conll2003 format for NER annotation into dataset
"""
A O
spokesman O
for O
Israel B-ORG
civil I-ORG
administration I-ORG

Samuel B-PER
Graham I-PER
said O
it O
->
X = [['A', 'spokesman', 'for', 'Israel', 'civil', 'administration'], ['Samuel', 'Graham', 'said', 'it']]
Y = [['O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG'], ['B-PER', 'I-PER', 'O', 'O']]
idx2label = ['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']
"""
def create_dataset(path):
    """Read a two-column, CoNLL-style NER file into token and label lists.

    The file is expected to hold one ``token label`` pair per line, separated
    by a single space, with sentences separated by a blank line.

    Args:
        path: Path of the annotation file to read; it is decoded as UTF-8.

    Returns:
        A tuple ``(X, Y, idx2label)``. ``X`` is a list of sentences, each a
        list of tokens. ``Y`` is the matching list of label sequences.
        ``idx2label`` lists every distinct label in order of first
        appearance.

    Lines that do not split into exactly two space-separated fields are
    reported on stdout and skipped. Blank lines and empty sentence blocks
    are ignored silently, so an empty file yields ``([], [], [])``.
    """
    with open(path, encoding="utf-8") as f:
        blocks = f.read().strip('\n').split('\n\n')

    X, Y = [], []
    idx2label = []
    seen_labels = set()  # constant-time membership; idx2label keeps the order
    for block in blocks:
        if not block.strip():
            # An empty file or a run of extra blank lines is not a sentence.
            continue
        tokens, labels = [], []
        for line in block.split('\n'):
            if not line.strip():
                # Leftover separator line inside a block; nothing to report.
                continue
            fields = line.split(' ')
            if len(fields) != 2:
                # Explicit check instead of a bare ``except`` so that only
                # malformed lines are skipped, never unrelated errors.
                print(f"error: skip line: {line}")
                continue
            token, label = fields
            tokens.append(token)
            labels.append(label)
            if label not in seen_labels:
                seen_labels.add(label)
                idx2label.append(label)
        X.append(tokens)
        Y.append(labels)
    return X, Y, idx2label
# Parse the annotation file at PATH and print a short summary of the corpus.
X, Y, idx2label = create_dataset(PATH)
print(f"{len(X)} sentences with {sum(map(len, X))} tokens in total")
print(f"labels: {idx2label}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment