Created
November 3, 2021 17:46
-
-
Save shwars/580b55684be3328eb39ecf01b9cbbd88 to your computer and use it in GitHub Desktop.
Blog: Convert text from BC5CDR format into BIO encoding
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_ner(rec, len_limit=4096):
    """Convert one annotated text record into token-level BIO labels.

    Relies on two module-level names defined outside this function:
    ``tokenizer`` (must provide ``tokenize(str) -> list[str]``) and
    ``annot_map`` (maps an annotation's type value to the label suffix).

    Args:
        rec: A ``(text, annotations)`` pair. Each annotation is indexed as
            ``ann[0]`` = start offset, ``ann[1]`` = end offset (exclusive)
            and ``ann[3]`` = entity type (a key of ``annot_map``).
            NOTE(review): annotations are assumed to be sorted by start
            offset; nothing here sorts them -- confirm with the caller.
        len_limit: Maximum number of tokens/labels to produce.

    Returns:
        A ``(text, tokens, labels)`` tuple: the lower-cased text, the tokens
        (truncated to the number of labels) and one ``'O'`` / ``'B-…'`` /
        ``'I-…'`` label per token.
    """
    txt, anns = rec
    tokens = tokenizer.tokenize(txt)
    # Tokens are located in the lower-cased text.
    # NOTE(review): presumably the tokenizer is an uncased word-piece model;
    # with a cased tokenizer the lookups below would miss -- confirm.
    txt = txt.lower()

    pos = 0       # character offset associated with the current token
    prev = 'O'    # entity type of the previous token, or 'O' when outside
    res = []
    for t in tokens:
        if len(res) >= len_limit:
            break

        # Word-piece continuations are marked with a leading '##'. A bare
        # '#' token is ordinary text and must be looked up like any other.
        is_continuation = t.startswith('##')
        if not is_continuation:
            found = txt.find(t, pos)
            # str.find returns -1 when the token is absent (e.g. an unknown
            # token placeholder); keep the last good offset so that one
            # miss does not derail alignment for the rest of the text.
            if found >= 0:
                pos = found

        # Discard every annotation that ends at or before this offset
        # *before* labelling, so an entity that starts right after another
        # one is not missed. Resetting ``prev`` makes the next entity open
        # with 'B-' even when it has the same type as the one just closed.
        while anns and anns[0][1] <= pos:
            anns = anns[1:]
            prev = 'O'

        if anns and anns[0][0] <= pos < anns[0][1]:
            kind = anns[0][3]
            prefix = 'I-' if prev == kind else 'B-'
            res.append(prefix + annot_map[kind])
            prev = kind
        else:
            res.append('O')
            prev = 'O'

        if is_continuation:
            # Advance by the piece length without its '##' marker.
            pos += len(t) - 2

    # Keep tokens and labels the same length when len_limit cut the loop.
    if len(tokens) > len(res):
        tokens = tokens[:len(res)]
    return txt, tokens, res
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment