Skip to content

Instantly share code, notes, and snippets.

Last active January 27, 2021 07:13
Show Gist options
  • Save liangfu/954c032adf25273bb15f5706ed2f3013 to your computer and use it in GitHub Desktop.
Save liangfu/954c032adf25273bb15f5706ed2f3013 to your computer and use it in GitHub Desktop.
Convert the BosonNLP origindata.txt corpus to a CoNLL-style, one-character-per-line format for named entity recognition (NER).
import random
def bosonnlp_to_bio2(origfile, trainfile, valfile, val_ratio=0.2):
    """Convert an inline-annotated corpus into CoNLL-style token files.

    Each line of *origfile* is treated as one document in which entities are
    marked inline as ``{{label:text}}``.  The first ``val_ratio`` share of the
    lines (in file order, no shuffling) is written to *valfile* and the
    remaining lines to *trainfile*.

    Every character of a document becomes one output line of the form
    ``<char> X X <tag>``, where ``<tag>`` is ``O`` for plain text and
    ``I-<LABEL>`` for characters inside an entity.  No ``B-`` tag and no
    blank separator line is emitted.

    Args:
        origfile: Path of the annotated input file (read as UTF-8).
        trainfile: Path of the training file to create (written as UTF-8).
        valfile: Path of the validation file to create (written as UTF-8).
        val_ratio: Fraction of the input lines that goes to *valfile*.
    """
    # Explicit UTF-8 keeps the conversion independent of the platform's
    # default locale encoding.
    with open(origfile, 'rt', encoding='utf-8') as fp:
        lines = fp.readlines()

    val_samples = int(len(lines) * val_ratio)
    _write_conll(trainfile, lines[val_samples:])
    _write_conll(valfile, lines[:val_samples])


# Single-character substitutions applied to every document.  All of them map
# one character to one character, so entity offsets computed before the
# substitution remain valid afterwards.
_CHAR_NORMALIZATION = str.maketrans({
    " ": "_",        # whitespace would break the space-separated columns
    "\t": "_",
    "\u3000": "_",   # ideographic space
    "\uff0c": ",",   # full-width comma
    "\u201c": '"',   # left curly double quote
    "\u201d": '"',   # right curly double quote
    "\uff1a": ":",   # full-width colon
    "\uff08": "(",   # full-width left parenthesis
    "\uff09": ")",   # full-width right parenthesis
})


def _parse_annotated_line(line):
    """Strip ``{{label:text}}`` markers from *line* and record their spans.

    Returns:
        A pair ``(annotations, document)``.  ``document`` is the line with
        every marker replaced by its text and with ``_CHAR_NORMALIZATION``
        applied.  ``annotations`` is a list of ``[start, end, label]`` items
        whose half-open ``start:end`` range indexes into ``document``.
    """
    annotations = []
    pieces = []
    length = 0  # number of document characters collected so far
    pos = 0     # read position in the raw line

    while True:
        start = line.find("{{", pos)
        end = line.find("}}", start + 2) if start >= 0 else -1
        if end < 0:
            # No further complete marker: keep the rest of the line as plain
            # text, so that lines without any entity are not lost.
            pieces.append(line[pos:])
            break

        label, _, text = line[start + 2:end].partition(":")
        prefix = line[pos:start]
        tic = length + len(prefix)
        toc = tic + len(text)
        annotations.append([tic, toc, label])

        pieces.append(prefix)
        pieces.append(text)
        length = toc
        pos = end + 2

    document = "".join(pieces).translate(_CHAR_NORMALIZATION)
    return annotations, document


def _entity_tag(label):
    """Turn a raw label such as ``person_name`` into a tag such as ``PERSON``."""
    if label.lower().endswith("name"):
        label = label[:-4]
    return label.upper().strip("-_")


def _write_conll(path, lines):
    """Write one ``<char> X X <tag>`` row per character of every line."""
    with open(path, 'w', encoding='utf-8') as fp:
        for line in lines:
            annotations, document = _parse_annotated_line(line)

            # Tag the unstripped document so the offsets line up; when spans
            # overlap, the later annotation wins.
            tags = ["O"] * len(document)
            for tic, toc, label in annotations:
                tags[tic:toc] = ["I-" + _entity_tag(label)] * (toc - tic)

            # Surrounding whitespace (e.g. the trailing newline) is not
            # emitted; ``offset`` keeps the tags aligned with what remains.
            body = document.strip()
            offset = len(document) - len(document.lstrip())

            # NOTE(review): the previous code kept a counter that was reset
            # every 125 characters and at sentence-ending punctuation
            # ("limit sequence length to 128 - 2") but never wrote anything
            # at those points.  The output is unchanged without it; if a
            # blank separator line was intended, it belongs here.
            fp.writelines(
                "{} X X {}\n".format(char, tags[index])
                for index, char in enumerate(body, offset)
            )
# Creates (or truncates) documents.txt in the current working directory.
# NOTE(review): the original body of this ``with`` statement is not present
# in this copy of the script, so nothing is written to the file here --
# confirm what, if anything, was meant to be stored in it.
with open("documents.txt", "w") as fp:
    pass

# Convert the corpus in the current directory into train/validation files.
bosonnlp_to_bio2('origindata.txt', 'train.txt', 'val.txt')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment