Skip to content

Instantly share code, notes, and snippets.

@shwars
Created November 3, 2021 17:46
Show Gist options
  • Save shwars/580b55684be3328eb39ecf01b9cbbd88 to your computer and use it in GitHub Desktop.
Save shwars/580b55684be3328eb39ecf01b9cbbd88 to your computer and use it in GitHub Desktop.
Blog: Convert text from BC5CDR format into BIO encoding
def get_ner(rec, len_limit=4096):
    """Convert one annotated record into per-token BIO labels.

    Relies on two names defined elsewhere in the module: ``tokenizer``
    (its ``tokenize(text)`` method yields the tokens) and ``annot_map``
    (maps an annotation's entity type to the tag suffix used in the labels).

    Args:
        rec: Pair ``(txt, anns)``. ``txt`` is the raw text. ``anns`` is a
            sequence of annotations in which item ``[0]`` is the start
            character offset, item ``[1]`` the end offset (exclusive) and
            item ``[3]`` the entity type looked up in ``annot_map``.
        len_limit: Maximum number of tokens to label and return.

    Returns:
        Tuple ``(txt, tokens, labels)``: the lower-cased text, the tokens
        (at most ``len_limit`` of them) and one label per token, each being
        ``'O'``, ``'B-<tag>'`` or ``'I-<tag>'``.
    """
    txt, anns = rec
    tokens = tokenizer.tokenize(txt)
    # Only the first `len_limit` tokens are labelled and returned.
    tokens = tokens[:max(len_limit, 0)]
    txt = txt.lower()
    # Walk the annotations left to right on a sorted copy, so the caller's
    # list is neither mutated nor required to be pre-sorted.
    anns = sorted(anns, key=lambda a: (a[0], a[1]))

    cursor = 0        # offset in txt just past the last token that was located
    ann_idx = 0       # first annotation that has not been fully passed yet
    open_ann = None   # index of the annotation the previous token belonged to
    labels = []
    for tok in tokens:
        # NOTE(review): assumes WordPiece-style tokens where a leading '##'
        # marks a continuation of the previous word -- confirm for `tokenizer`.
        # A literal '#' character in the text is an ordinary token.
        is_piece = tok.startswith('##') and len(tok) > 2
        surface = (tok[2:] if is_piece else tok).lower()

        pos = txt.find(surface, cursor) if surface else -1
        if pos < 0:
            # Token text is absent from txt (e.g. an unknown-token marker or
            # a normalised form). Do not let -1 corrupt the cursor: label by
            # the next non-blank character and leave the cursor in place.
            pos = cursor
            while pos < len(txt) and txt[pos].isspace():
                pos += 1
        else:
            cursor = pos + len(surface)

        # Drop annotations that end at or before this token *before*
        # labelling, so the first token of the next annotation is not missed.
        while ann_idx < len(anns) and anns[ann_idx][1] <= pos:
            ann_idx += 1

        if ann_idx < len(anns) and anns[ann_idx][0] <= pos:
            tag = annot_map[anns[ann_idx][3]]
            # 'I-' only while staying inside the same annotation; a new
            # annotation always starts with 'B-', even if its type repeats.
            prefix = 'I-' if open_ann == ann_idx else 'B-'
            labels.append(prefix + tag)
            open_ann = ann_idx
        else:
            labels.append('O')
            open_ann = None

    return txt, tokens, labels
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment