Skip to content

Instantly share code, notes, and snippets.

@khaledadrani
Created January 17, 2022 13:38
Show Gist options
  • Save khaledadrani/64be1c980c394649b58cd1a7eaecbd75 to your computer and use it in GitHub Desktop.
Save khaledadrani/64be1c980c394649b58cd1a7eaecbd75 to your computer and use it in GitHub Desktop.
def import_documents_set_iob(train_file_path):
    """Load a tab-separated IOB file and group its tokens by document.

    The file is read as UTF-8. A line that is exactly
    ``-DOCSTART- -X- O O`` followed by a newline opens a new document.
    Every following line is split on tab characters; the first column and
    the second column (with trailing whitespace removed) are stored as a
    ``(first, second)`` tuple in the current document. Any further columns
    are ignored.

    Lines are skipped without error when:

    * they contain no tab (for example blank separator lines), or
    * they appear before the first document marker, since no document is
      open yet to receive them.

    Args:
        train_file_path: Path of the file to read.

    Returns:
        A list with one entry per document marker, in file order. Each
        entry is a list of ``(first_column, second_column)`` string tuples
        and may be empty. An empty file, or a file without any document
        marker, yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Compared against the raw line, newline included, so only an exact
    # marker line starts a new document.
    docstart_line = '-DOCSTART- -X- O O\n'

    new_train_set = []
    document = None  # stays None until the first marker has been seen

    with open(train_file_path, encoding="utf8") as f:
        for line in f:
            if line == docstart_line:
                # Register the document right away: it is the same list
                # object we keep appending to, so no look-ahead to the next
                # marker or to the end of the file is needed.
                document = []
                new_train_set.append(document)
                continue

            if document is None:
                # Content before the first marker belongs to no document.
                continue

            columns = line.split("\t")
            if len(columns) < 2:
                # No tab on this line: nothing to pair, so skip it.
                continue

            document.append((columns[0], columns[1].rstrip()))

    return new_train_set
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment