Skip to content

Instantly share code, notes, and snippets.

@ttpro1995
Created July 25, 2018 03:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ttpro1995/cd8c60cfc72416a02713bb93dff9ae6f to your computer and use it in GitHub Desktop.
import sys
# Token classification codes returned by detect_tag().
END_TAG = 0    # token is the closing tag "<END>"
START_TAG = 1  # token opens a tag, e.g. "<START:LOCATION>"
NOT_TAG = -1   # ordinary word token
def detect_tag(in_token):
    """Classify a single whitespace-delimited token.

    :param in_token: one token taken from a tagged sentence
    :return: START_TAG if the token opens a tag, END_TAG if it is
        exactly "<END>", otherwise NOT_TAG
    """
    if "<START:" in in_token:
        return START_TAG
    if in_token == "<END>":
        return END_TAG
    return NOT_TAG
def remove_nest_tag(in_str):
    """Split a sentence with nested NER tags into one sentence per nesting level.

    Example (docstring of the original gist):
        "với <START:ORGANIZATION> Sở Cảnh sát ... <START:LOCATION> Hà Nội <END> <END>"
    produces one sentence per nesting depth — depth 0 carries no tags at all,
    depth 1 keeps only the outermost tag pair, depth 2 only the inner pair —
    so each output sentence is un-nested and usable for OpenNLP name training.

    :param in_str: sentence whose tokens are separated by whitespace
    :return: list of generated sentences, index 0 being the tag-free one

    Fixes vs. original: removed the unreachable statements after ``return``
    (a dead ``print(taglist)``) and the ``taglist`` accumulator they used,
    which was never read on any live path.
    """
    depth = 0        # current tag nesting depth
    max_nest = 0     # deepest nesting level seen
    tag_dict = {}    # token index -> (index, depth, token), tag tokens only
    sentence_token = in_str.split()

    # First pass: record the nesting depth of every tag token.
    for index, token in enumerate(sentence_token):
        tag = detect_tag(token)
        if tag == START_TAG:
            depth += 1
            if max_nest < depth:
                max_nest = depth
            tag_dict[index] = (index, depth, token)
        elif tag == END_TAG:
            # An END closes the most recently opened START, so it shares that
            # START's depth; record it before stepping back out of the level.
            tag_dict[index] = (index, depth, token)
            depth -= 1

    # Second pass: emit one sentence per level, keeping every plain token
    # plus only the tag tokens recorded at that level.
    generate_sentences = []
    for level in range(max_nest + 1):
        generate_sentence_token = []
        for index, token in enumerate(sentence_token):
            if detect_tag(token) == NOT_TAG:
                generate_sentence_token.append(token)
            elif tag_dict[index][1] == level:
                generate_sentence_token.append(token)
        generate_sentences.append(' '.join(generate_sentence_token))
    return generate_sentences
def test():
    """Run remove_nest_tag on a doubly-nested example and print each result."""
    sample = "với <START:ORGANIZATION> Sở Cảnh sát Phòng cháy , chữa cháy ( PCCC ) và cứu nạn , cứu hộ <START:LOCATION> Hà Nội <END> <END>"
    generated = remove_nest_tag(sample)
    print("-----")
    for line in generated:
        print(line)
if __name__ == "__main__":
    # Un-nest a dataset of tagged sentences for OpenNLP name-finder training:
    # reads input.txt, writes one un-nested sentence per line to input.txt.out.
    # test()
    if len(sys.argv) > 1:
        inpath = sys.argv[1]
        # Context managers guarantee both handles are closed even on error;
        # the original never closed the input file and only closed the
        # output file on the success path.
        with open(inpath, 'r') as infile, open(inpath + ".out", 'w') as outfile:
            for line in infile:
                for sentence in remove_nest_tag(line):
                    outfile.write(sentence + "\n")
    else:
        print("usage: python unnest_data.py input.txt")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment