Created
January 17, 2022 13:38
-
-
Save khaledadrani/64be1c980c394649b58cd1a7eaecbd75 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def import_documents_set_iob(train_file_path):
    """Read a DOCSTART-delimited, tab-separated token file into documents.

    The file is scanned line by line. A line equal to
    ``-DOCSTART- -X- O O`` (ignoring its line terminator) opens a new
    document and closes the previous one. Every other line is split on
    tabs; when it has at least two columns, the pair
    ``(column 0, column 1 with trailing whitespace removed)`` is appended
    to the current document. Lines with fewer than two columns (blank
    separator lines, for instance) are skipped. Any columns after the
    second one are ignored.

    Token lines that appear before the first DOCSTART marker are collected
    into a leading document instead of being lost, so a file without any
    marker yields a single document.

    Args:
        train_file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of documents, each a list of ``(column 0, column 1)`` string
        tuples, in file order. A marker that is not followed by any token
        line produces an empty document. An empty file returns ``[]``.

    Raises:
        OSError: If the file cannot be opened.
    """
    docstart_marker = '-DOCSTART- -X- O O'
    documents = []
    # None means "no document opened yet"; an empty list means a document
    # was opened by a marker but has not received any token so far.
    current = None

    with open(train_file_path, encoding="utf8") as f:
        for line in f:
            # Compare without the terminator so that a marker on the very
            # last line (no trailing newline) is still recognised.
            if line.rstrip("\r\n") == docstart_marker:
                if current is not None:
                    documents.append(current)
                current = []
                continue

            fields = line.split("\t")
            if len(fields) < 2:
                # Not a token line (blank or malformed): nothing to store.
                continue

            if current is None:
                # Tokens before any marker start an implicit first document.
                current = []
            current.append((fields[0], fields[1].rstrip()))

    # Flush the document that was still open when the file ended.
    if current is not None:
        documents.append(current)
    return documents
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment