# khaledadrani/import_document_set_iob.py

## import_document_set_iob.py
def import_documents_set_iob(train_file_path):
    """Read a tab-separated IOB file and group its tokens by document.

    A line exactly equal to ``-DOCSTART- -X- O O`` followed by a newline
    opens a new document. Every other line is split on tab characters; when
    it has at least two fields, the first two are stored as a
    ``(field0, field1)`` tuple, with trailing whitespace stripped from the
    second field. Lines with fewer than two fields (for example blank
    separator lines) contribute no token.

    A document is stored once its last line has been read, i.e. when the
    next line is a document marker or the file ends. A marker that is
    immediately followed by another marker, or that is the last line of the
    file, therefore produces no document.

    Tokens that appear before any marker are collected into an implicit
    first document, so a file without any marker yields a single document
    instead of failing.

    Args:
        train_file_path: Path of the file to read; it is opened as UTF-8
            text.

    Returns:
        A list of documents, each document being a list of 2-tuples of
        strings built from the first two tab-separated fields of a line
        (presumably the token and its tag -- confirm against the data).

    Raises:
        OSError: If the file cannot be opened.
    """
    doc_marker = '-DOCSTART- -X- O O\n'

    with open(train_file_path, encoding="utf8") as f:
        lines = f.readlines()

    documents = []
    # None until a marker or a first usable token has been seen.
    document = None
    last_index = len(lines) - 1

    for index, line in enumerate(lines):
        if line == doc_marker:
            document = []
            continue

        fields = line.split("\t")
        if len(fields) >= 2:
            if document is None:
                # Token found before any marker: open an implicit document.
                document = []
            document.append((fields[0], fields[1].rstrip()))

        # Store the document when this is its last line. The short-circuit
        # on last_index keeps the look-ahead inside the list bounds.
        is_last_line_of_document = (
            index == last_index or lines[index + 1] == doc_marker
        )
        if document is not None and is_last_line_of_document:
            documents.append(document)

    return documents
	def import_documents_set_iob(train_file_path):
		"""Load an IOB file and return its tokens grouped per document.

		The file is read as UTF-8 text. A line exactly equal to
		``-DOCSTART- -X- O O`` plus a newline starts a new document. Any
		other line is split on tabs; if it has two or more fields, the pair
		``(field0, field1)`` is added to the current document, with trailing
		whitespace removed from the second field. Lines with fewer than two
		fields (such as blank lines) add nothing.

		A document is stored when its last line has been processed, meaning
		the following line is a marker or the file ends. A marker directly
		followed by another marker, or a marker on the last line, yields no
		document. Tokens found before any marker form an implicit first
		document, so marker-less files return one document.

		Args:
			train_file_path: Path of the file to read.

		Returns:
			A list of documents; each document is a list of 2-tuples of
			strings taken from the first two tab-separated fields of a line
			(presumably token and tag -- confirm against the data).

		Raises:
			OSError: If the file cannot be opened.
		"""
		# NOTE(review): a function with this same name is defined earlier in
		# the file; the later definition is the one bound at import time.
		doc_marker = '-DOCSTART- -X- O O\n'

		with open(train_file_path, encoding="utf8") as f:
			lines = f.readlines()

		documents = []
		# Stays None until a marker or a first usable token is seen.
		document = None
		last_index = len(lines) - 1

		for index, line in enumerate(lines):
			if line == doc_marker:
				document = []
				continue

			fields = line.split("\t")
			if len(fields) >= 2:
				if document is None:
					# Token before any marker: open an implicit document.
					document = []
				document.append((fields[0], fields[1].rstrip()))

			# Checking last_index first keeps the look-ahead in bounds.
			closes_document = (
				index == last_index or lines[index + 1] == doc_marker
			)
			if document is not None and closes_document:
				documents.append(document)

		return documents