-
-
Save ttpro1995/cd8c60cfc72416a02713bb93dff9ae6f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
END_TAG = 0      # token is the literal closing tag "<END>"
START_TAG = 1    # token opens an entity, e.g. "<START:LOCATION>"
NOT_TAG = -1     # plain text token


def detect_tag(in_token):
    """
    Classify a single whitespace-delimited token.

    :param in_token: one token from the sentence
    :return: START_TAG if the token contains "<START:", END_TAG if it is
        exactly "<END>", NOT_TAG otherwise
    """
    if in_token == "<END>":
        return END_TAG
    return START_TAG if "<START:" in in_token else NOT_TAG
def remove_nest_tag(in_str):
    """
    Split a sentence with nested NER tags into one sentence per nesting level.

    Example input:
        với <START:ORGANIZATION> Sở Cảnh sát Phòng cháy , chữa cháy ( PCCC )
        và cứu nạn , cứu hộ <START:LOCATION> Hà Nội <END> <END>

    yields the untagged sentence (level 0) plus one sentence per nesting
    depth, each keeping only the tags opened at that depth.

    NOTE(review): assumes tags are balanced; an "<END>" without a matching
    "<START:...>" drives the depth counter negative — confirm input is
    well-formed upstream.

    :param in_str: whitespace-tokenized sentence, possibly with nested
        <START:...> / <END> tags
    :return: list of generated sentences, one per level (max depth + 1 total)
    """
    sentence_token = in_str.split()
    tag_dict = dict()   # token index -> (index, depth, token), tags only
    state = 0           # current nesting depth
    max_nest = 0        # deepest nesting level seen

    # First pass: record the nesting depth of every tag token.
    # A START increases depth before recording; an END records at the depth
    # of the entity it closes, then decreases depth.
    for index, token in enumerate(sentence_token):
        tag = detect_tag(token)
        if tag == START_TAG:
            state += 1
            if max_nest < state:
                max_nest = state
            tag_dict[index] = (index, state, token)
        elif tag == END_TAG:
            tag_dict[index] = (index, state, token)
            state -= 1

    # Second pass: emit one sentence per depth level.  Plain tokens appear
    # in every sentence; tag tokens appear only in the sentence for the
    # level at which they were recorded (level 0 is the untagged sentence).
    generate_sentences = []
    for level in range(max_nest + 1):
        generate_sentence_token = []
        for index, token in enumerate(sentence_token):
            if detect_tag(token) == NOT_TAG:
                generate_sentence_token.append(token)
            elif tag_dict[index][1] == level:
                generate_sentence_token.append(token)
        generate_sentences.append(' '.join(generate_sentence_token))
    return generate_sentences
def test():
    """Ad-hoc smoke test: un-nest a sample nested-tag sentence and print each level."""
    tstr = "với <START:ORGANIZATION> Sở Cảnh sát Phòng cháy , chữa cháy ( PCCC ) và cứu nạn , cứu hộ <START:LOCATION> Hà Nội <END> <END>"
    result = remove_nest_tag(tstr)
    print("-----")
    for line in result:
        print(line)
if __name__ == "__main__":
    # Un-nest a dataset for OpenNLP name finding: every input line containing
    # nested <START:...>/<END> tags expands into one output line per nesting
    # level, written to "<input>.out".
    if len(sys.argv) > 1:
        inpath = sys.argv[1]
        # Context managers close both files even on error; the original
        # leaked the input handle and closed the output unconditionally.
        with open(inpath, 'r') as infile, open(inpath + ".out", 'w') as outfile:
            for line in infile:
                sentences = remove_nest_tag(line)
                for sentence in sentences:
                    outfile.write(sentence + "\n")
    else:
        print("usage: python unnest_data.py input.txt")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment